1 // Copyright (c) 2018 Intel Corporation
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 #include "include/mmad.cl"
17 #define SUM_SCALE 0.11f
20 #ifdef LIGHTWEIGHT_QUANTIZATION
22 #define QUANTIZATION(idx) \
25 for(uint z = 0; z < 4; z++)\
27 tmp.s0 = (float)eltw_input_vals[z * 4 + 0] * SUM_SCALE + bias_f.s0;\
28 tmp.s1 = (float)eltw_input_vals[z * 4 + 1] * SUM_SCALE + bias_f.s1;\
29 tmp.s2 = (float)eltw_input_vals[z * 4 + 2] * SUM_SCALE + bias_f.s2;\
30 tmp.s3 = (float)eltw_input_vals[z * 4 + 3] * SUM_SCALE + bias_f.s3;\
32 regC_uchar16[z * 4 + 0] = convert_uchar_sat( (regC[0 * 4 + i][idx + z / 4]) * SCALE + tmp.s0);\
33 regC_uchar16[z * 4 + 1] = convert_uchar_sat( (regC[1 * 4 + i][idx + z / 4]) * SCALE + tmp.s1);\
34 regC_uchar16[z * 4 + 2] = convert_uchar_sat( (regC[2 * 4 + i][idx + z / 4]) * SCALE + tmp.s2);\
35 regC_uchar16[z * 4 + 3] = convert_uchar_sat( (regC[3 * 4 + i][idx + z / 4]) * SCALE + tmp.s3);\
41 #define QUANTIZATION(idx) \
42 regC_uchar16.s0 = regC[0 * 4 + i][idx];\
43 regC_uchar16.s1 = regC[1 * 4 + i][idx];\
44 regC_uchar16.s2 = regC[2 * 4 + i][idx];\
45 regC_uchar16.s3 = regC[3 * 4 + i][idx];\
47 regC_uchar16.s4 = regC[0 * 4 + i][idx+1];\
48 regC_uchar16.s5 = regC[1 * 4 + i][idx+1];\
49 regC_uchar16.s6 = regC[2 * 4 + i][idx+1];\
50 regC_uchar16.s7 = regC[3 * 4 + i][idx+1];\
52 regC_uchar16.s8 = regC[0 * 4 + i][idx+2];\
53 regC_uchar16.s9 = regC[1 * 4 + i][idx+2];\
54 regC_uchar16.sa = regC[2 * 4 + i][idx+2];\
55 regC_uchar16.sb = regC[3 * 4 + i][idx+2];\
57 regC_uchar16.sc = regC[0 * 4 + i][idx+3];\
58 regC_uchar16.sd = regC[1 * 4 + i][idx+3];\
59 regC_uchar16.se = regC[2 * 4 + i][idx+3];\
60 regC_uchar16.sf = regC[3 * 4 + i][idx+3];\
63 for(uint s = 0; s <16; s++)\
65 sum[s] = (int)as_char(regC_uchar16[s]) + (int)as_char(eltw_input_vals[s]);\
67 regC_uchar16.s0 = convert_uchar_sat( sum.s0 );\
68 regC_uchar16.s1 = convert_uchar_sat( sum.s1 );\
69 regC_uchar16.s2 = convert_uchar_sat( sum.s2 );\
70 regC_uchar16.s3 = convert_uchar_sat( sum.s3 );\
72 regC_uchar16.s4 = convert_uchar_sat( sum.s4 );\
73 regC_uchar16.s5 = convert_uchar_sat( sum.s5 );\
74 regC_uchar16.s6 = convert_uchar_sat( sum.s6 );\
75 regC_uchar16.s7 = convert_uchar_sat( sum.s7 );\
77 regC_uchar16.s8 = convert_uchar_sat( sum.s8 );\
78 regC_uchar16.s9 = convert_uchar_sat( sum.s9 );\
79 regC_uchar16.sa = convert_uchar_sat( sum.sa );\
80 regC_uchar16.sb = convert_uchar_sat( sum.sb );\
82 regC_uchar16.sc = convert_uchar_sat( sum.sc );\
83 regC_uchar16.sd = convert_uchar_sat( sum.sd );\
84 regC_uchar16.se = convert_uchar_sat( sum.se );\
85 regC_uchar16.sf = convert_uchar_sat( sum.sf );\
90 #define QUANTIZATION(idx) \
91 regC_uchar16.s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));\
92 regC_uchar16.s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));\
93 regC_uchar16.s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));\
94 regC_uchar16.s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));\
96 regC_uchar16.s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+1]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));\
97 regC_uchar16.s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+1]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));\
98 regC_uchar16.s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+1]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));\
99 regC_uchar16.s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+1]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));\
101 regC_uchar16.s8 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+2]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));\
102 regC_uchar16.s9 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+2]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));\
103 regC_uchar16.sa = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+2]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));\
104 regC_uchar16.sb = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+2]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));\
106 regC_uchar16.sc = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+3]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));\
107 regC_uchar16.sd = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+3]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));\
108 regC_uchar16.se = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+3]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));\
109 regC_uchar16.sf = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+3]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));\
112 for(uint s = 0; s <16; s++)\
114 sum[s] = (int)as_char(regC_uchar16[s]) + (int)as_char(eltw_input_vals[s]);\
116 regC_uchar16.s0 = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.s0) * eltw_calib_f.s0)), NL_M_ELTW, NL_N_ELTW));\
117 regC_uchar16.s1 = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.s1) * eltw_calib_f.s1)), NL_M_ELTW, NL_N_ELTW));\
118 regC_uchar16.s2 = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.s2) * eltw_calib_f.s2)), NL_M_ELTW, NL_N_ELTW));\
119 regC_uchar16.s3 = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.s3) * eltw_calib_f.s3)), NL_M_ELTW, NL_N_ELTW));\
121 regC_uchar16.s4 = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.s4) * eltw_calib_f.s0)), NL_M_ELTW, NL_N_ELTW));\
122 regC_uchar16.s5 = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.s5) * eltw_calib_f.s1)), NL_M_ELTW, NL_N_ELTW));\
123 regC_uchar16.s6 = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.s6) * eltw_calib_f.s2)), NL_M_ELTW, NL_N_ELTW));\
124 regC_uchar16.s7 = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.s7) * eltw_calib_f.s3)), NL_M_ELTW, NL_N_ELTW));\
126 regC_uchar16.s8 = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.s8) * eltw_calib_f.s0)), NL_M_ELTW, NL_N_ELTW));\
127 regC_uchar16.s9 = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.s9) * eltw_calib_f.s1)), NL_M_ELTW, NL_N_ELTW));\
128 regC_uchar16.sa = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.sa) * eltw_calib_f.s2)), NL_M_ELTW, NL_N_ELTW));\
129 regC_uchar16.sb = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.sb) * eltw_calib_f.s3)), NL_M_ELTW, NL_N_ELTW));\
131 regC_uchar16.sc = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.sc) * eltw_calib_f.s0)), NL_M_ELTW, NL_N_ELTW));\
132 regC_uchar16.sd = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.sd) * eltw_calib_f.s1)), NL_M_ELTW, NL_N_ELTW));\
133 regC_uchar16.se = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.se) * eltw_calib_f.s2)), NL_M_ELTW, NL_N_ELTW));\
134 regC_uchar16.sf = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.sf) * eltw_calib_f.s3)), NL_M_ELTW, NL_N_ELTW));\
139 inline uint FUNC(calculate_output_offset_to_account_padding)(uint cOffset)
141 #if OUT_WITH_PADDING == 1
142 uint tmp_idx = cOffset;
143 uint f_val_idx = tmp_idx % 32;
145 uint b_val_idx = tmp_idx % 4;
147 uint x_idx = tmp_idx % OUTPUT_SIZE_X;
148 tmp_idx /= OUTPUT_SIZE_X;
149 uint y_idx = tmp_idx % OUTPUT_SIZE_Y;
150 tmp_idx /= OUTPUT_SIZE_Y;
151 uint b_slice_idx = tmp_idx % (OUTPUT_BATCH_NUM / 4);
152 tmp_idx /= (OUTPUT_BATCH_NUM / 4);
153 uint f_slice_idx = tmp_idx % (OUTPUT_FEATURE_NUM / 32);
155 uint padded_offset = f_slice_idx * OUT_F_BLOCK_PITCH;
156 padded_offset += b_slice_idx * OUT_B_BLOCK_PITCH;
157 padded_offset += y_idx * OUT_Y_PITCH;
158 padded_offset += x_idx * OUT_X_PITCH;
159 padded_offset += b_val_idx * 32;
160 padded_offset += f_val_idx;
161 padded_offset += OUT_OFFSET;
163 return padded_offset;
170 inline uint FUNC(calculate_eltw_input_offset_based_on_output_offset_account_padding)(uint cOffset, uint strideX, uint strideY)
172 #if ELTW_WITH_PADDING == 1 || ELTW_STRIDE_X != 1 || ELTW_STRIDE_Y != 1
173 uint tmp_idx = cOffset;
174 uint f_val_idx = tmp_idx % 32;
176 uint b_val_idx = tmp_idx % 4;
178 uint x_idx = tmp_idx % OUTPUT_SIZE_X;
180 tmp_idx /= OUTPUT_SIZE_X;
181 uint y_idx = tmp_idx % OUTPUT_SIZE_Y;
183 tmp_idx /= OUTPUT_SIZE_Y;
184 uint b_slice_idx = tmp_idx % (OUTPUT_BATCH_NUM / 4);
185 tmp_idx /= (OUTPUT_BATCH_NUM / 4);
186 uint f_slice_idx = tmp_idx % (OUTPUT_FEATURE_NUM / 32);
188 uint padded_offset = f_slice_idx * IN2_F_BLOCK_PITCH;
189 padded_offset += b_slice_idx * IN2_B_BLOCK_PITCH;
190 padded_offset += y_idx * IN2_Y_PITCH;
191 padded_offset += x_idx * IN2_X_PITCH;
192 padded_offset += b_val_idx * 32;
193 padded_offset += f_val_idx;
194 padded_offset += IN2_OFFSET;
196 return padded_offset;
203 inline void FUNC(mmad_32x32_int8)( __local uint* l_tileA, const uint l_offsetTileA,
204 __local int8* l_tileB, const uint l_offsetTileB_col0,
205 const uint l_offsetTileB_col1, const uint l_offsetTileB_col2,
206 const uint l_offsetTileB_col3, int8* rowA, int8* colB,
209 // Read tile A from SLM to regA
210 uint l_offsetTileATemp = l_offsetTileA;
211 __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
212 for (uint j = 0; j < (SG_TILE_M / 8); ++j)
214 rowA[j] = as_int8(SLM_BLOCK_READ_8(&l_tileA[l_offsetTileATemp]));
215 l_offsetTileATemp += 8 * SG_SIZE;
217 // Read tile B from SLM to regB and compute mmad
218 colB[0] = l_tileB[l_offsetTileB_col0];
219 colB[1] = l_tileB[l_offsetTileB_col1];
220 __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
221 for (uint j = 0; j < (SG_TILE_M / 8); ++j)
224 regC[0*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[0], regC[0*(SIMD_LANE_M / 8) + j]);
226 colB[0] = l_tileB[l_offsetTileB_col2];
227 __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
228 for (uint j = 0; j < (SG_TILE_M / 8); ++j)
231 regC[1*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[1], regC[1*(SIMD_LANE_M / 8) + j] );
233 colB[1] = l_tileB[l_offsetTileB_col3];
234 __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
235 for (uint j = 0; j < (SG_TILE_M / 8); ++j)
238 regC[2*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[0], regC[2*(SIMD_LANE_M / 8) + j]);
240 __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
241 for (uint j = 0; j < (SG_TILE_M / 8); ++j)
244 regC[3*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[1], regC[3*(SIMD_LANE_M / 8) + j]);
249 * \brief GEMM kernel to compute MxN matrix using SLM
250 * \param g_inA - Input matrix
251 * \param g_inB - Input matrix
252 * \param g_outC - Output matrix
255 __attribute__((intel_reqd_sub_group_size(SG_SIZE)))
256 KERNEL(Kernel_GEMM_MMAD8_32x32SG_128x128WG_SLM_INT8_fused_eltwise)
258 __global char* const g_inA,
259 __global int* g_outC,
260 __global char* const g_inB,
262 __global BIAS_TYPE* biases,
264 __global float* quantizations,
266 __global float* calibrations,
269 __global char* const input2,
270 __global float* eltw_calibrations
274 __global int4* const g_matrixA = (__global int4*)g_inA;
275 __global int4* const g_matrixB = (__global int4*)g_inB;
276 __global int8* g_matrixC = (__global int8*)g_outC;
278 // Each work-group works to compute 128x128 tile.
279 // Each work-group contains 16 sub-groups.
280 // Each sub-group within the work-group works to compute a 32x32 tile.
281 // 1) All work-items in WG fill SLM with tileA (128x32) and tileB (32x128).
282 // 2) Each sub-group works to compute 32x32 tileC (stored in regC).
283 // Note that each work-item in the sub-group computes a 32x4 chunk of tileC.
284 // 3) Repeat until tileC is fully computed (while moving tileA and tileB "windows")
285 __local int8 l_workGroupTileA[2 * (WG_TILE_M * MATRIX_SMALL_K) / sizeof(int8)]; // [2*128*32/8] = 1024
286 __local int8 l_workGroupTileB[2 * (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8)]; // [2*128*32/8] = 1024
288 __local uint* l_workGroupTileA_uint = (__local uint*)l_workGroupTileA;
289 __local int4* l_workGroupTileA_int4 = (__local int4*)l_workGroupTileA;
290 __local int4* l_workGroupTileB_int4 = (__local int4*)l_workGroupTileB;
292 const uint l_groupSize = get_local_size(DIM_X) * get_local_size(DIM_Y);
294 const uint l_pingPongOffsetA_uint = (WG_TILE_M * MATRIX_SMALL_K) / sizeof(uint);
295 const uint l_pingPongOffsetB_int8 = (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8);
296 const uint l_pingPongOffsetA_int4 = (WG_TILE_M * MATRIX_SMALL_K) / sizeof(int4);
297 const uint l_pingPongOffsetB_int4 = (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int4);
300 const uint g_tidY = get_global_id(DIM_Y); // 0,...,all_wi_inY
301 const uint g_tidX = get_global_id(DIM_X); // 0,...,all_wi_inX
302 const uint l_tidX = get_local_id(DIM_X); // 0,...,31 in WG
303 const uint l_tidY = get_local_id(DIM_Y); // 0,1,2,3 in WG
304 const uint l_tid = l_tidY * get_local_size(DIM_X) + l_tidX; // 0,1,2,...127
307 const uint sg_tid = get_sub_group_local_id(); // 0,1,...,8
308 const uint sg_global_idX = (uint)(g_tidX / SG_SIZE); //{0}/8
309 const uint sg_global_idY = g_tidY; //{0}
311 const uint sg_local_idX = (uint)(l_tidX / SG_SIZE); // {0,...,31}/8={0,0,0,0,0...,1,1,1,...,3,3,3}
312 const uint sg_local_idY = l_tidY; // 0,1,2,3
313 const uint sg_local_id = sg_local_idY * get_local_size(DIM_X) / SG_SIZE + sg_local_idX; // get_local_size(DIM_X) / SG_SIZE = 32/8 = 4
315 const uint sub_group_id = get_sub_group_id();
319 int8 regC[(SIMD_LANE_M / 8) * SIMD_LANE_N] = {0}; // Each work-item responsible for 32x4 ints elts // (32/8)*4
320 int8 rowA[(SG_TILE_M * MATRIX_SMALL_K / SG_SIZE) / sizeof(int8)]; // each work-item will hold 1/8 of matrixA
321 int8 colB[2]; // each lane will store 32x4 piece of matrixB
324 const uint l_offsetTileA = SG_TILE_M * (MATRIX_SMALL_K / sizeof(uint)) * sg_local_idY;
325 const uint numElements32x32TileB = (MATRIX_SMALL_K * SG_TILE_N) / sizeof(int8);
326 const uint numElements32x8TileB = numElements32x32TileB / 4;
327 const uint l_offsetTileB = numElements32x32TileB * sg_local_idX;
328 const uint l_offsetTileB_col0 = l_offsetTileB + sg_tid;
329 const uint l_offsetTileB_col1 = l_offsetTileB + 1 * numElements32x8TileB + sg_tid;
330 const uint l_offsetTileB_col2 = l_offsetTileB + 2 * numElements32x8TileB + sg_tid;
331 const uint l_offsetTileB_col3 = l_offsetTileB + 3 * numElements32x8TileB + sg_tid;
336 #ifdef TILED_GLOBAL_LAYOUT // 32-row major (matrixA) and 32-col major (matrixB)
337 g_idxA[0] = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_M) * get_group_id(DIM_Y) + l_tid;
338 g_idxB[0] = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_N) * get_group_id(DIM_X) + l_tid;
339 g_idxA[1] = g_idxA[0] + l_groupSize;
340 g_idxB[1] = g_idxB[0] + l_groupSize;
341 #else // Row (matrixA) and Col (matrixB) major layout
342 g_idxA[0] = WG_TILE_M * (MATRIX_K / sizeof(int4)) * get_group_id(DIM_Y) +
343 (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2);
344 g_idxB[0] = WG_TILE_N * (MATRIX_K / sizeof(int4)) * get_group_id(DIM_X) +
345 (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2);
346 g_idxA[1] = g_idxA[0] + (l_groupSize / 2) * (MATRIX_K / sizeof(int4));
347 g_idxB[1] = g_idxB[0] + (l_groupSize / 2) * (MATRIX_K / sizeof(int4));
352 l_workGroupTileA_int4[l_tid] = g_matrixA[g_idxA[0]];
353 l_workGroupTileB_int4[l_tid] = g_matrixB[g_idxB[0]];
354 l_workGroupTileA_int4[l_tid + l_groupSize] = g_matrixA[g_idxA[1]];
355 l_workGroupTileB_int4[l_tid + l_groupSize] = g_matrixB[g_idxB[1]];
357 #ifdef TILED_GLOBAL_LAYOUT
358 g_idxA[0] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4);
359 g_idxB[0] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4);
360 g_idxA[1] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4);
361 g_idxB[1] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4);
363 g_idxA[0] += MATRIX_SMALL_K / sizeof(int4);
364 g_idxB[0] += MATRIX_SMALL_K / sizeof(int4);
365 g_idxA[1] += MATRIX_SMALL_K / sizeof(int4);
366 g_idxB[1] += MATRIX_SMALL_K / sizeof(int4);
369 barrier(CLK_LOCAL_MEM_FENCE);
372 int4 hdcReadValueA[2];
373 int4 hdcReadValueB[2];
375 __attribute__((opencl_unroll_hint(1)))
376 for (uint k = 0; k < (MATRIX_K / MATRIX_SMALL_K) - 1; k++)
379 * SLM setup - HDC read only
381 // Overlap HDC reads with mmad compute
382 hdcReadValueA[0] = g_matrixA[g_idxA[0]];
383 hdcReadValueB[0] = g_matrixB[g_idxB[0]];
384 hdcReadValueA[1] = g_matrixA[g_idxA[1]];
385 hdcReadValueB[1] = g_matrixB[g_idxB[1]];
387 #ifdef TILED_GLOBAL_LAYOUT
388 g_idxA[0] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4);
389 g_idxB[0] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4);
390 g_idxA[1] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4);
391 g_idxB[1] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4);
393 g_idxA[0] += MATRIX_SMALL_K / sizeof(int4);
394 g_idxB[0] += MATRIX_SMALL_K / sizeof(int4);
395 g_idxA[1] += MATRIX_SMALL_K / sizeof(int4);
396 g_idxB[1] += MATRIX_SMALL_K / sizeof(int4);
402 FUNC_CALL(mmad_32x32_int8)(&l_workGroupTileA_uint[(k % 2) * l_pingPongOffsetA_uint],
403 l_offsetTileA, &l_workGroupTileB[(k % 2) * l_pingPongOffsetB_int8],
404 l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2,
405 l_offsetTileB_col3, rowA, colB, regC);
408 * SLM setup - SLM write only
410 l_workGroupTileA_int4[((k + 1) % 2 * l_pingPongOffsetA_int4) + l_tid] = hdcReadValueA[0];
411 l_workGroupTileB_int4[((k + 1) % 2 * l_pingPongOffsetB_int4) + l_tid] = hdcReadValueB[0];
412 l_workGroupTileA_int4[((k + 1) % 2 * l_pingPongOffsetA_int4) + l_tid + l_groupSize] = hdcReadValueA[1];
413 l_workGroupTileB_int4[((k + 1) % 2 * l_pingPongOffsetB_int4) + l_tid + l_groupSize] = hdcReadValueB[1];
415 barrier(CLK_LOCAL_MEM_FENCE);
419 * Last mmad compute iteration (avoids branching in main loop)
422 FUNC_CALL(mmad_32x32_int8)(
423 &l_workGroupTileA_uint[(((MATRIX_K / MATRIX_SMALL_K) - 1) % 2) * l_pingPongOffsetA_uint],
425 &l_workGroupTileB[(((MATRIX_K / MATRIX_SMALL_K) - 1) % 2) * l_pingPongOffsetB_int8],
426 l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, l_offsetTileB_col3, rowA, colB,
429 #ifdef OUTPUT_TILED_GLOBAL_LAYOUT
430 // Write out in swizzled manner after quantizing
431 __global uchar* g_outC_uchar = (__global uchar*)g_outC;
432 uint cOffset = sg_global_idX * (MATRIX_M * SG_TILE_N / sizeof(uchar)) +
433 sg_global_idY * (SG_TILE_M * SG_TILE_N / sizeof(uchar));
435 uchar16 regC_uchar16;
436 uint offset_uc16 = 0;
438 const uint workgroup_id_x = get_group_id(0);
439 uint feature_off = 32*(sub_group_id % (WG_TILE_N / 32)) + WG_TILE_N*workgroup_id_x; //=32*{0,1,2,3} + WG_TILE_N * workgroup_id_x
440 uint feature = get_sub_group_local_id()*4 + feature_off;
442 float4 quant_f = vload4(0, quantizations + feature);
443 float4 bias_f = vload4(0, biases + feature);
444 float4 calib_f = vload4(0, calibrations + feature);
447 float4 eltw_calib_f = vload4(0, eltw_calibrations + feature);
449 uchar16 eltw[(2*SG_TILE_M) / (sizeof(int8) / sizeof(int))];
450 uint tmpcOff = cOffset;
451 __attribute__((opencl_unroll_hint( SG_TILE_M / (sizeof(int8) / sizeof(int)) )))
452 for (uint i = 0; i < (2*SG_TILE_M) / (sizeof(int8) / sizeof(int)); i++)
454 uint padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(tmpcOff);
456 eltw[i] = as_uchar16(intel_sub_group_block_read4((__global uint*)(g_outC_uchar + padded_offset)));
458 const uint eltw_second_input_offset = FUNC_CALL(calculate_eltw_input_offset_based_on_output_offset_account_padding)(tmpcOff, ELTW_STRIDE_X, ELTW_STRIDE_Y);
459 eltw[i] = as_uchar16(intel_sub_group_block_read4((__global uint*)(input2 + eltw_second_input_offset)));
461 tmpcOff += sizeof(uchar16) * SG_SIZE;
464 #if MMAD_SUPPORTED == 1
465 __attribute__((opencl_unroll_hint( SG_TILE_M / (sizeof(int8) / sizeof(int)) )))
467 for (uint i = 0; i < SG_TILE_M / (sizeof(int8) / sizeof(int)); i++)
469 uint padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset);
471 uchar16 eltw_input_vals = eltw[i * 2];
476 intel_sub_group_block_write4((__global uint*)(g_outC_uchar + padded_offset), as_uint4(regC_uchar16));
477 cOffset += sizeof(uchar16) * SG_SIZE;
479 // now we need to calculate again for other x
480 padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset);
482 uchar16 eltw_input_vals = eltw[i * 2 + 1];
487 intel_sub_group_block_write4( (__global uint*)(g_outC_uchar + padded_offset), as_uint4(regC_uchar16) );
488 cOffset += sizeof(uchar16) * SG_SIZE;
491 // Write final accumulated values
492 uint cOffset = sg_global_idX * ((MATRIX_M / 8) * SG_TILE_N) + sg_global_idY * (SG_TILE_M / 8) +
493 sg_tid * (MATRIX_M / 8);
494 __attribute__((opencl_unroll_hint(SIMD_LANE_N)))
495 for (uint i = 0; i < (SIMD_LANE_N); ++i)
497 __attribute__((opencl_unroll_hint(SIMD_LANE_M / 8)))
498 for (uint j = 0; j < (SIMD_LANE_M / 8); ++j)
500 g_matrixC[cOffset + j] = regC[i*(SIMD_LANE_M / 8) + j];
502 cOffset += SG_SIZE * (MATRIX_M / 8);