// Copyright (c) 2016-2017 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "include/mmad.cl"

#ifdef LIGHTWEIGHT_QUANTIZATION
#define QUANTIZATION \
    out_write_N2K4[0].s0 = convert_uchar_sat((float)outvec0.s0 * SCALE + bias_f.s0); /*K= lane_id,N=0*/ \
    out_write_N2K4[0].s1 = convert_uchar_sat((float)outvec1.s0 * SCALE + bias_f.s1); /*K= lane_id + 8,N=0*/\
    out_write_N2K4[0].s2 = convert_uchar_sat((float)outvec2.s0 * SCALE + bias_f.s2); /*K= lane_id + 16,N=0*/\
    out_write_N2K4[0].s3 = convert_uchar_sat((float)outvec3.s0 * SCALE + bias_f.s3); /*K= lane_id + 24,N=0*/\
    \
    out_write_N2K4[0].s4 = convert_uchar_sat((float)outvec0.s1 * SCALE + bias_f.s0); /*K= lane_id,N=1*/\
    out_write_N2K4[0].s5 = convert_uchar_sat((float)outvec1.s1 * SCALE + bias_f.s1); /*K= lane_id + 8,N=1*/\
    out_write_N2K4[0].s6 = convert_uchar_sat((float)outvec2.s1 * SCALE + bias_f.s2); /*K= lane_id + 16,N=1*/\
    out_write_N2K4[0].s7 = convert_uchar_sat((float)outvec3.s1 * SCALE + bias_f.s3); /*K= lane_id + 24,N=1*/\
    \
    out_write_N2K4[1].s0 = convert_uchar_sat((float)outvec0.s2 * SCALE + bias_f.s0); /*K= lane_id,N=2*/\
    out_write_N2K4[1].s1 = convert_uchar_sat((float)outvec1.s2 * SCALE + bias_f.s1); /*K= lane_id + 8,N=2*/\
    out_write_N2K4[1].s2 = convert_uchar_sat((float)outvec2.s2 * SCALE + bias_f.s2); /*K= lane_id + 16,N=2*/\
    out_write_N2K4[1].s3 = convert_uchar_sat((float)outvec3.s2 * SCALE + bias_f.s3); /*K= lane_id + 24,N=2*/\
    \
    out_write_N2K4[1].s4 = convert_uchar_sat((float)outvec0.s3 * SCALE + bias_f.s0); /*K= lane_id,N=3*/\
    out_write_N2K4[1].s5 = convert_uchar_sat((float)outvec1.s3 * SCALE + bias_f.s1); /*K= lane_id + 8,N=3*/\
    out_write_N2K4[1].s6 = convert_uchar_sat((float)outvec2.s3 * SCALE + bias_f.s2); /*K= lane_id + 16,N=3*/\
    out_write_N2K4[1].s7 = convert_uchar_sat((float)outvec3.s3 * SCALE + bias_f.s3); /*K= lane_id + 24,N=3*/
#elif NO_QUANTIZATION

#define QUANTIZATION \
    out_write_N2K4[0].s0 = convert_uchar_sat(outvec0.s0); /*K= lane_id,N=0*/ \
    out_write_N2K4[0].s1 = convert_uchar_sat(outvec1.s0); /*K= lane_id + 8,N=0*/\
    out_write_N2K4[0].s2 = convert_uchar_sat(outvec2.s0); /*K= lane_id + 16,N=0*/\
    out_write_N2K4[0].s3 = convert_uchar_sat(outvec3.s0); /*K= lane_id + 24,N=0*/\
    \
    out_write_N2K4[0].s4 = convert_uchar_sat(outvec0.s1); /*K= lane_id,N=1*/\
    out_write_N2K4[0].s5 = convert_uchar_sat(outvec1.s1); /*K= lane_id + 8,N=1*/\
    out_write_N2K4[0].s6 = convert_uchar_sat(outvec2.s1); /*K= lane_id + 16,N=1*/\
    out_write_N2K4[0].s7 = convert_uchar_sat(outvec3.s1); /*K= lane_id + 24,N=1*/\
    \
    out_write_N2K4[1].s0 = convert_uchar_sat(outvec0.s2); /*K= lane_id,N=2*/\
    out_write_N2K4[1].s1 = convert_uchar_sat(outvec1.s2); /*K= lane_id + 8,N=2*/\
    out_write_N2K4[1].s2 = convert_uchar_sat(outvec2.s2); /*K= lane_id + 16,N=2*/\
    out_write_N2K4[1].s3 = convert_uchar_sat(outvec3.s2); /*K= lane_id + 24,N=2*/\
    \
    out_write_N2K4[1].s4 = convert_uchar_sat(outvec0.s3); /*K= lane_id,N=3*/\
    out_write_N2K4[1].s5 = convert_uchar_sat(outvec1.s3); /*K= lane_id + 8,N=3*/\
    out_write_N2K4[1].s6 = convert_uchar_sat(outvec2.s3); /*K= lane_id + 16,N=3*/\
    out_write_N2K4[1].s7 = convert_uchar_sat(outvec3.s3); /*K= lane_id + 24,N=3*/
#else

#define QUANTIZATION \
    out_write_N2K4[0].s0 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec0.s0) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N)); /*K= lane_id,N=0*/ \
    out_write_N2K4[0].s1 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec1.s0) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N)); /*K= lane_id + 8,N=0*/\
    out_write_N2K4[0].s2 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec2.s0) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N)); /*K= lane_id + 16,N=0*/\
    out_write_N2K4[0].s3 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec3.s0) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N)); /*K= lane_id + 24,N=0*/\
    \
    out_write_N2K4[0].s4 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec0.s1) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N)); /*K= lane_id,N=1*/\
    out_write_N2K4[0].s5 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec1.s1) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N)); /*K= lane_id + 8,N=1*/\
    out_write_N2K4[0].s6 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec2.s1) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N)); /*K= lane_id + 16,N=1*/\
    out_write_N2K4[0].s7 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec3.s1) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N)); /*K= lane_id + 24,N=1*/\
    \
    out_write_N2K4[1].s0 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec0.s2) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N)); /*K= lane_id,N=2*/\
    out_write_N2K4[1].s1 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec1.s2) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N)); /*K= lane_id + 8,N=2*/\
    out_write_N2K4[1].s2 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec2.s2) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N)); /*K= lane_id + 16,N=2*/\
    out_write_N2K4[1].s3 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec3.s2) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N)); /*K= lane_id + 24,N=2*/\
    \
    out_write_N2K4[1].s4 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec0.s3) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N)); /*K= lane_id,N=3*/\
    out_write_N2K4[1].s5 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec1.s3) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N)); /*K= lane_id + 8,N=3*/\
    out_write_N2K4[1].s6 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec2.s3) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N)); /*K= lane_id + 16,N=3*/\
    out_write_N2K4[1].s7 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec3.s3) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N)); /*K= lane_id + 24,N=3*/

#endif
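
/* Worked example of the fixed-scale path (assumed values, for illustration only):
   with SCALE = 0.05f and bias_f.s0 = 2.0f, an int accumulator outvec0.s0 = 1000
   quantizes to convert_uchar_sat(1000 * 0.05f + 2.0f) = 52. */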
#define _MMAD_4x8(C, A, B) MMAD_4x8(A, B, C)

#define _OD OUTPUT_FEATURE_NUM
#define _OW OUTPUT_SIZE_X
#define _OH OUTPUT_SIZE_Y
#define OWPAD (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X)
#define OHPAD (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y)

#define _IH INPUT0_SIZE_Y
#define _IW INPUT0_SIZE_X
#define _ID INPUT0_FEATURE_NUM

#define K_HEIGHT FILTER_SIZE_Y
#define K_WIDTH FILTER_SIZE_X

#define BATCH_SIZE OUTPUT_BATCH_NUM

#define IHPAD (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y)
#define IWPAD (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_PAD_AFTER_SIZE_X)

#define K_STRIDE STRIDE_SIZE_X

// For now the kernel stride is square
#define K_WSTRIDE K_STRIDE
#define K_HSTRIDE K_STRIDE
__attribute__((intel_reqd_sub_group_size(8)))
KERNEL(convolution_mmad_slm_2x14_rep4)(
    __global int8* inputs,
    __global uchar* outputs,
    __global int8* weights,
#if BIAS_TERM
    __global BIAS_TYPE* biases,
#endif
#if QUANTIZATION_TERM
    const __global float* quantizations,
#endif
#if CALIBRATION_TERM
    const __global float* calibrations,
#endif
    uint split_idx)
{
    const uint TILE_H = OUT_BLOCK_HEIGHT*LOCAL_SIZE_Z;
    const uint TILE_W = OUT_BLOCK_WIDTH*LOCAL_SIZE_Y;

    ushort fmg     = get_group_id(0);   // Output Depth
    ushort group_y = get_group_id(1);   // Output Width
    ushort group_z = get_group_id(2);   // Output Height

    /* 16,1,8 WG , SIMD8 - 16 HW threads in a WG
       threads 0-1   : ( lid_x:0-15, lid_y:0, lid_z:0 )
       threads 2-3   : ( lid_x:0-15, lid_y:0, lid_z:1 )
       ...
       threads 12-13 : ( lid_x:0-15, lid_y:0, lid_z:6 )
       threads 14-15 : ( lid_x:0-15, lid_y:0, lid_z:7 ) */
    /* Thread, local IDs */
    ushort thread_id      = get_sub_group_id();
    ushort threadid_mod_2 = thread_id % 2;
    ushort threadid_mod_8 = thread_id % 8;

    ushort lid_x = get_local_id(0);
    ushort lid_z = get_local_id(2);

    uchar lane_id = get_sub_group_local_id();
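
    /* Illustrative mapping (derived from the id math above): for
       get_sub_group_id() == 5, threadid_mod_2 == 1 (the odd thread of a fused
       pair) and threadid_mod_8 == 5; lane_id runs 0..7 within the SIMD8 sub-group. */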
    /* 32-bit signed accumulator , 112 output registers for 1Px7Qx4Nx32K output tile size
       Will be converted to 8-bits before the final write */

    int4 out_07  [ OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH ] = {0};  // For output channels 0-7
    int4 out_815 [ OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH ] = {0};  // For output channels 8-15
    int4 out_1623[ OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH ] = {0};  // For output channels 16-23
    int4 out_2431[ OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH ] = {0};  // For output channels 24-31
    /* Account for batching */

    ushort batch = ( fmg*LOCAL_SIZE_X*4 ) / _OD;  // Each thread processes 32 output channels and each fmg processes 64 output channels ( LOCAL_SIZE_X is only 16 )

    // Size calculated for int8 elements
    uint input_size = (_IH + IHPAD) * (_IW + IWPAD) * BATCH_PACK ;

    uint in_addr_offset = batch*input_size;
    /* Goto activation tile for work group, offset is w.r.t int8 array */

    uint groupy_tile = TILE_W*group_y;
    uint groupz_tile = TILE_H*group_z;

    in_addr_offset += (groupz_tile * K_STRIDE) * (_IW + IWPAD) * BATCH_PACK + (groupy_tile * K_STRIDE) * BATCH_PACK;
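
    /* Worked example (assumed sizes, for illustration only): with K_STRIDE = 2,
       (_IW + IWPAD) = 16, BATCH_PACK = 4 and group_y = group_z = 1 so that
       groupy_tile = groupz_tile = 7, the offset advances by
       (7*2)*16*4 + (7*2)*4 = 896 + 56 = 952 int8 elements. */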
    /* SLM space for Activation, Weights
       ( 16,1,8 ) Workgroup - 7 tiles along Y direction and 64 different output channels
       2 threads used to load global memory
       Activation - 9Hx9Wx4Nx32C   Weights - 3Rx3Sx64Kx32C */

    __local int8 act_slm    [ 9*9*4 ];
    __local int8 weight_slm [ 9*64 ];
    /* 9Hx9Wx4Nx32C activation tile written into SLM, distributed across the 16 threads in the workgroup:
       threads 0-1   write 9x4x32 of H=0, W=0...8
       threads 2-3   write 9x4x32 of H=1, W=0...8
       threads 4-5   write 9x4x32 of H=2, W=0...8
       threads 6-7   write 9x4x32 of H=3, W=0...8
       threads 8-9   write 9x4x32 of H=4, W=0...8
       threads 10-11 write 9x4x32 of H=5, W=0...8
       threads 12-13 write 9x4x32 of H=6, W=0...8
       thread  14    writes 9x4x32 of H=7, W=0...8
       thread  15    writes 9x4x32 of H=8, W=0...8 */
    /* Goto activation tile for thread in group */

    uint row_offset = thread_id / 2;

    if ( thread_id >= 14 )
    {
        /* Threads 14 and 15 each load a full row on their own ( H=7 and H=8 ) */
        row_offset = thread_id - 7;
    }

    // In addr offset for the particular thread
    in_addr_offset += row_offset * K_STRIDE * (_IW + IWPAD ) * BATCH_PACK ;

    /* Activation SLM indices */
    uint act_slm_write = row_offset * ( TILE_W + 2) * BATCH_PACK;
    uint act_slm_read  = OUT_BLOCK_HEIGHT * lid_z * ( TILE_W + 2) * BATCH_PACK ;
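
    /* Worked example (assumed sizes, for illustration only): with TILE_W = 7 and
       BATCH_PACK = 4, one SLM activation row spans (7+2)*4 = 36 int8 elements, so
       thread_id = 4 gives row_offset = 2 and act_slm_write = 72, while a reader
       with OUT_BLOCK_HEIGHT = 1 and lid_z = 3 gets act_slm_read = 108. */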
    /* 9RSx64Kx32C Weight Block in SLM
       thread0 handles ( reads from global ) w(0,0),w(0,1),w(0,2) of K=0,1 ( k=0..15 )
       thread1 handles w(0,0),w(0,1),w(0,2) of K=2,3 ( k=16..31 )
       thread2 handles w(1,0),w(1,1) of K=0,1 ( k=0..15 )
       thread3 handles w(1,0),w(1,1) of K=2,3 ( k=16..31 )
       thread4 handles w(1,2),w(2,0) of K=0,1 ( k=0..15 )
       thread5 handles w(1,2),w(2,0) of K=2,3 ( k=16..31 )
       thread6 handles w(2,1),w(2,2) of K=0,1 ( k=0..15 )
       thread7 handles w(2,1),w(2,2) of K=2,3 ( k=16..31 )

       Similarly, threads 8-15 handle K=4,5,6,7.

       SLM layout per weight pixel:
       w(R=0,S=0,k=0..7,C=0..15),  w(R=0,S=0,k=32..39,C=0..15)
       w(R=0,S=0,k=0..7,C=16..31), w(R=0,S=0,k=32..39,C=16..31)

       This interleaving avoids SLM bank conflicts when fused threads read from SLM:
       thread0 reads k=0..31, thread1 reads k=32..63.

       All output channels for one weight pixel are stored first, then the next weight pixel follows. */
#define NUM_FILTERS (K_HEIGHT * K_WIDTH)

    uint output_depth = fmg % ( _OD / ( LOCAL_SIZE_X * 4 ) );  // LOCAL_SIZE_X = 16, 64 output channels used

    uint weight_size_CRS = ( _ID / PACK ) * NUM_FILTERS * 8;  // 8 output channels packed inside

    // Global weight addr for workgroup
    uint weight_global_addr_offset = output_depth * 8 * weight_size_CRS ;  // 64 output channels per workgroup

    /* Global weight address for thread */

    // Goto appropriate output channel in weights
    uint weight_global_channel_offset = threadid_mod_2 * 2 * weight_size_CRS ;

    uint slm_channel_offset     = threadid_mod_2;
    uint bc_fused_thread_offset = 0;
    if ( thread_id >= 8 )
    {
        bc_fused_thread_offset = 1;

        weight_global_channel_offset = 4 * weight_size_CRS + slm_channel_offset * weight_size_CRS * 2 ;
    }

    // Goto appropriate pixel in weights

    uint weight_global_pixel_offset = 0;
    uint slm_pixel_offset           = 0;

    if ( threadid_mod_8 >= 2 )
    {
        /* First three pixels handled by threads 0-1, then 2 pixels handled by each following thread pair */

        weight_global_pixel_offset = 3*8  + ( ( (threadid_mod_8/2) - 1 )*2*8 );
        slm_pixel_offset           = 3*64 + ( ( (threadid_mod_8/2) - 1 )*2*64 );
    }

    weight_global_addr_offset += weight_global_channel_offset + weight_global_pixel_offset;

    /* Weight slm write index */

    uint slm_write_weight = slm_pixel_offset + slm_channel_offset * 32 + bc_fused_thread_offset * 4;
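
    /* Worked example (for illustration only): thread_id = 3 has threadid_mod_8 = 3
       and threadid_mod_2 = 1, so slm_pixel_offset = 3*64 + 0 = 192,
       slm_channel_offset = 1 and bc_fused_thread_offset = 0, giving
       slm_write_weight = 192 + 32 + 0 = 224 int8 elements into weight_slm. */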
    /* Weight slm read index */

    /* Thread 0 reads output channels 0-15, thread 1 handles output channels 16-31;
       the data is present in interleaved form:

       w(0,0) C=0..7,  K=0..7  | w(0,0) C=0..7,  K=32..39
       w(0,0) C=8..15, K=0..7  | w(0,0) C=8..15, K=32..39
       w(0,0) C=0..7,  K=8..15 | w(0,0) C=0..7,  K=40..47
       w(0,0) C=8..15, K=8..15 | w(0,0) C=8..15, K=40..47 */

    uint wt_slm_rd_offset = threadid_mod_2*4;

    uint kd;
    __attribute__((opencl_unroll_hint(1)))
    for(kd = 0; kd < ( _ID / PACK ) ; kd++)
    {
        /* Load Activation from global to SLM */

        int in_addr = kd * (_IH + IHPAD) * (_IW + IWPAD) * BATCH_SIZE + in_addr_offset;

        __global uint *activation_tile = (__global uint*)&inputs[ in_addr ];

        __local uint *act_slm_ptr = (__local uint *) &act_slm [ act_slm_write ];

        /* The odd thread in a fused pair will start from the next 4x8 block */

        activation_tile += threadid_mod_2*4*8;
        act_slm_ptr     += threadid_mod_2*4*8;

        int4 act_col_0 = as_int4( intel_sub_group_block_read4(activation_tile) );          // col 0
        int4 act_col_1 = as_int4( intel_sub_group_block_read4(activation_tile + 8*8) );    // col 2
        int4 act_col_2 = as_int4( intel_sub_group_block_read4(activation_tile + 2*8*8) );  // col 4
        int4 act_col_3 = as_int4( intel_sub_group_block_read4(activation_tile + 3*8*8) );  // col 6
        SLM_BLOCK_WRITE_4 ( act_slm_ptr , as_uint4 ( act_col_0 ) );
        SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 8*8 )   , as_uint4 ( act_col_1 ) );
        SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 2*8*8 ) , as_uint4 ( act_col_2 ) );
        SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 3*8*8 ) , as_uint4 ( act_col_3 ) );

        if ( threadid_mod_2 == 0 )
        {
            int4 act_col_4 = as_int4( intel_sub_group_block_read4(activation_tile + 4*8*8) );

            SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 4*8*8 ) , as_uint4 ( act_col_4 ) );
        }
        /* Advance to the next activation row */

        activation_tile = activation_tile + 1 * (_IW + IWPAD ) * BATCH_PACK * 8;
        act_slm_ptr     = act_slm_ptr + (TILE_W + 2) * BATCH_PACK * 8;
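        /* Note: activation_tile and act_slm_ptr are uint pointers over int8 data,
           so every logical int8 element accounts for 8 uints - hence the *8 in the
           row strides above. */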
        int4 act_col_9  = as_int4( intel_sub_group_block_read4(activation_tile) );
        int4 act_col_10 = as_int4( intel_sub_group_block_read4(activation_tile + 8*8) );
        int4 act_col_11 = as_int4( intel_sub_group_block_read4(activation_tile + 2*8*8) );
        int4 act_col_12 = as_int4( intel_sub_group_block_read4(activation_tile + 3*8*8) );

        SLM_BLOCK_WRITE_4 ( act_slm_ptr , as_uint4 ( act_col_9 ) );
        SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 8*8 )   , as_uint4 ( act_col_10 ) );
        SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 2*8*8 ) , as_uint4 ( act_col_11 ) );
        SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 3*8*8 ) , as_uint4 ( act_col_12 ) );

        if ( threadid_mod_2 == 0 )
        {
            int4 act_col_13 = as_int4( intel_sub_group_block_read4(activation_tile + 4*8*8) );

            SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 4*8*8 ) , as_uint4 ( act_col_13 ) );
        }
        /* Load weights from global to weight_slm */

        int weight_addr = kd * NUM_FILTERS * 8 + weight_global_addr_offset;

        __global uint *weight_tile = (__global uint*)&weights [ weight_addr ];
        __local  uint *wt_slm_ptr  = (__local uint *)&weight_slm [ slm_write_weight ];

        __global uint *weight_tile_2 = weight_tile;
        __local  uint *wt_slm_ptr_2  = wt_slm_ptr;

        int4 w0 = as_int4 ( intel_sub_group_block_read4( weight_tile ) );         // Pixel1 K=0..7 C=0..15
        int4 w1 = as_int4 ( intel_sub_group_block_read4( weight_tile + 4*8 ) );   // Pixel1 K=0..7 C=16..31
        int4 w2 = as_int4 ( intel_sub_group_block_read4( weight_tile + 8*8 ) );   // Pixel2 K=0..7 C=0..15
        int4 w3 = as_int4 ( intel_sub_group_block_read4( weight_tile + 12*8 ) );  // Pixel2 K=0..7 C=16..31

        // Goto next output channel
        weight_tile += weight_size_CRS*8;

        int4 w4 = as_int4 ( intel_sub_group_block_read4( weight_tile ) );         // Pixel1 K=8..15 C=0..15
        int4 w5 = as_int4 ( intel_sub_group_block_read4( weight_tile + 4*8 ) );   // Pixel1 K=8..15 C=16..31
        int4 w6 = as_int4 ( intel_sub_group_block_read4( weight_tile + 8*8 ) );   // Pixel2 K=8..15 C=0..15
        int4 w7 = as_int4 ( intel_sub_group_block_read4( weight_tile + 12*8 ) );  // Pixel2 K=8..15 C=16..31

        SLM_BLOCK_WRITE_4 ( wt_slm_ptr, as_uint4 ( w0 ) );
        SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 8*8 )        , as_uint4 ( w1 ) );
        SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 64*8 )       , as_uint4 ( w2 ) );
        SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 64*8 + 8*8 ) , as_uint4 ( w3 ) );

        wt_slm_ptr += 16*8;

        SLM_BLOCK_WRITE_4 ( wt_slm_ptr , as_uint4 ( w4 ) );
        SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 8*8 )        , as_uint4 ( w5 ) );
        SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 64*8 )       , as_uint4 ( w6 ) );
        SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 64*8 + 8*8 ) , as_uint4 ( w7 ) );
        if( threadid_mod_8 < 2 )
        {
            /* Threads handling the first weight row also load its third pixel */

            weight_tile_2 += 16*8;
            wt_slm_ptr_2  += 2*64*8;

            int4 w0 = as_int4 ( intel_sub_group_block_read4( weight_tile_2 ) );        // Pixel1 K=0..7 C=0..15
            int4 w1 = as_int4 ( intel_sub_group_block_read4( weight_tile_2 + 4*8 ) );  // Pixel1 K=0..7 C=16..31

            // Goto next output channel
            weight_tile_2 += weight_size_CRS*8;

            int4 w4 = as_int4 ( intel_sub_group_block_read4( weight_tile_2 ) );        // Pixel1 K=8..15 C=0..15
            int4 w5 = as_int4 ( intel_sub_group_block_read4( weight_tile_2 + 4*8 ) );  // Pixel1 K=8..15 C=16..31

            SLM_BLOCK_WRITE_4 ( wt_slm_ptr_2, as_uint4 ( w0 ) );
            SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr_2 + 8*8 ) , as_uint4 ( w1 ) );

            wt_slm_ptr_2 += 16*8;

            SLM_BLOCK_WRITE_4 ( wt_slm_ptr_2 , as_uint4 ( w4 ) );
            SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr_2 + 8*8 ) , as_uint4 ( w5 ) );
        }

        // Synchronize SLM writes across the workgroup
        barrier(CLK_LOCAL_MEM_FENCE);
        uint wt_slm_rd = wt_slm_rd_offset;

        __local uint *slm_ptr0 = (__local uint *) &act_slm[ act_slm_read ];
        __local uint *slm_ptr1 = (__local uint *) &weight_slm[ wt_slm_rd ];

        /* Balance the load of weights and activations */
        int8 weights_reg[3];  // 24 registers
        int4 act_reg[18];     // 72 registers
        uint slm_read_pixel_offset = 64*8;
        /**********************************************************************************************************
          First phase - multiply the first row of weights and the first row of activations
        ***********************************************************************************************************/

        /* Load weights from SLM into registers - row0, output channels 0..7 */

        {
            __local uint *slm_ptrw0 = slm_ptr1;

            weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
            weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
            slm_ptrw0 += slm_read_pixel_offset;

            weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
            weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
            slm_ptrw0 += slm_read_pixel_offset;

            weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
            weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
        }

        /* Load 1Hx9Wx4N inputs, activation row0 */

        __attribute__((opencl_unroll_hint(9)))
        for (int ic = 0; ic < 9; ic++)
        {
            /* Load activations from SLM into registers */

            uint slm_offset = ic * BATCH_PACK * 8 ;

            act_reg [ ic ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ;
        }
        /* Order the mmad instructions to minimize dependency on src0,dst - also try to maximise reuse of weights_reg */

        /* Output channels 0-7 */

        out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[0], weights_reg[0] );
        out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[1], weights_reg[0] );
        out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[2], weights_reg[0] );
        out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[3], weights_reg[0] );
        out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[4], weights_reg[0] );
        out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[5], weights_reg[0] );
        out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[6], weights_reg[0] );

        out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[1], weights_reg[1] );
        out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[2], weights_reg[1] );
        out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[3], weights_reg[1] );
        out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[4], weights_reg[1] );
        out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[5], weights_reg[1] );
        out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[6], weights_reg[1] );
        out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[7], weights_reg[1] );

        out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[2], weights_reg[2] );
        out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[3], weights_reg[2] );
        out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[4], weights_reg[2] );
        out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[5], weights_reg[2] );
        out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[6], weights_reg[2] );
        out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[7], weights_reg[2] );
        out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[8], weights_reg[2] );
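
        /* The 21 MMADs above compute a 1-D sliding window along W: for filter tap
           s = 0..2 and output column q = 0..6,
           out_07[q] = _MMAD_4x8( out_07[q], act_reg[q + s], weights_reg[s] );
           nine activation columns and a 3-wide filter row yield seven outputs.
           The same pattern repeats below for channels 8..15, 16..23 and 24..31. */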
        /* Load weights from SLM into registers - row0, output channels 8..15 */

        {
            __local uint *slm_ptrw0 = slm_ptr1 + 2*8*8;

            weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
            weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
            slm_ptrw0 += slm_read_pixel_offset;

            weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
            weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
            slm_ptrw0 += slm_read_pixel_offset;

            weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
            weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
        }

        out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[0], weights_reg[0] );
        out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[1], weights_reg[0] );
        out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[2], weights_reg[0] );
        out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[3], weights_reg[0] );
        out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[4], weights_reg[0] );
        out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[5], weights_reg[0] );
        out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[6], weights_reg[0] );

        out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[1], weights_reg[1] );
        out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[2], weights_reg[1] );
        out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[3], weights_reg[1] );
        out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[4], weights_reg[1] );
        out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[5], weights_reg[1] );
        out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[6], weights_reg[1] );
        out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[7], weights_reg[1] );

        out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[2], weights_reg[2] );
        out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[3], weights_reg[2] );
        out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[4], weights_reg[2] );
        out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[5], weights_reg[2] );
        out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[6], weights_reg[2] );
        out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[7], weights_reg[2] );
        out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[8], weights_reg[2] );
        /* Load weights from SLM into registers - row0, output channels 16..23 */

        {
            __local uint *slm_ptrw0 = slm_ptr1 + 4*8*8;

            weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
            weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
            slm_ptrw0 += slm_read_pixel_offset;

            weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
            weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
            slm_ptrw0 += slm_read_pixel_offset;

            weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
            weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
        }

        out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[0], weights_reg[0] );
        out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[1], weights_reg[0] );
        out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[2], weights_reg[0] );
        out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[3], weights_reg[0] );
        out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[4], weights_reg[0] );
        out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[5], weights_reg[0] );
        out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[6], weights_reg[0] );

        /* Load 1Hx9Wx4N inputs, activation row1 */

        uint slm_row_offset_2 = 1*(TILE_W + 2)*BATCH_PACK*8;

        __attribute__((opencl_unroll_hint(9)))
        for (int ic = 0; ic < 9; ic++)
        {
            /* Load activations from SLM into registers */

            uint slm_offset = slm_row_offset_2 + ic * BATCH_PACK * 8 ;

            act_reg [ ic + 9 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ;
        }

        out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[1], weights_reg[1] );
        out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[2], weights_reg[1] );
        out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[3], weights_reg[1] );
        out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[4], weights_reg[1] );
        out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[5], weights_reg[1] );
        out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[6], weights_reg[1] );
        out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[7], weights_reg[1] );

        out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[2], weights_reg[2] );
        out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[3], weights_reg[2] );
        out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[4], weights_reg[2] );
        out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[5], weights_reg[2] );
        out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[6], weights_reg[2] );
        out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[7], weights_reg[2] );
        out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[8], weights_reg[2] );
        /* Load weights from SLM into registers - row0, output channels 24..31 */

        {
            __local uint *slm_ptrw0 = slm_ptr1 + 6*8*8;

            weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
            weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
            slm_ptrw0 += slm_read_pixel_offset;

            weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
            weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
            slm_ptrw0 += slm_read_pixel_offset;

            weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
            weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
        }

        out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[0], weights_reg[0] );
        out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[1], weights_reg[0] );
        out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[2], weights_reg[0] );
        out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[3], weights_reg[0] );
        out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[4], weights_reg[0] );
        out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[5], weights_reg[0] );
        out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[6], weights_reg[0] );

        out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[1], weights_reg[1] );
        out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[2], weights_reg[1] );
        out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[3], weights_reg[1] );
        out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[4], weights_reg[1] );
        out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[5], weights_reg[1] );
        out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[6], weights_reg[1] );
        out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[7], weights_reg[1] );

        out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[2], weights_reg[2] );
        out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[3], weights_reg[2] );
        out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[4], weights_reg[2] );
        out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[5], weights_reg[2] );
        out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[6], weights_reg[2] );
        out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[7], weights_reg[2] );
        out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[8], weights_reg[2] );
        /**********************************************************************************************************
          Second phase - multiply the second row of weights and the second row of activations
        ***********************************************************************************************************/

        /* Load weights from SLM into registers - row1, output channels 0..7 */

        {
            __local uint *slm_ptrw1 = slm_ptr1 + 3*slm_read_pixel_offset;

            weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
            weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
            slm_ptrw1 += slm_read_pixel_offset;

            weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
            weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
            slm_ptrw1 += slm_read_pixel_offset;

            weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
            weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
        }

        out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[9],  weights_reg[0] );
        out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[10], weights_reg[0] );
        out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[11], weights_reg[0] );
        out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[12], weights_reg[0] );
        out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[13], weights_reg[0] );
        out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[14], weights_reg[0] );
        out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[15], weights_reg[0] );

        out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[10], weights_reg[1] );
        out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[11], weights_reg[1] );
        out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[12], weights_reg[1] );
        out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[13], weights_reg[1] );
        out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[14], weights_reg[1] );
        out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[15], weights_reg[1] );
        out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[16], weights_reg[1] );

        out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[11], weights_reg[2] );
        out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[12], weights_reg[2] );
        out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[13], weights_reg[2] );
        out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[14], weights_reg[2] );
        out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[15], weights_reg[2] );
        out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[16], weights_reg[2] );
        out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[17], weights_reg[2] );
        /* Load weights from SLM into registers - row1, output channels 8..15 */

        {
            __local uint *slm_ptrw1 = slm_ptr1 + 3*slm_read_pixel_offset + 2*8*8;

            weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
            weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
            slm_ptrw1 += slm_read_pixel_offset;

            weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
            weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
            slm_ptrw1 += slm_read_pixel_offset;

            weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
            weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
        }

        out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[9],  weights_reg[0] );
        out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[10], weights_reg[0] );
        out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[11], weights_reg[0] );
        out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[12], weights_reg[0] );
        out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[13], weights_reg[0] );
        out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[14], weights_reg[0] );
        out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[15], weights_reg[0] );

        out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[10], weights_reg[1] );
        out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[11], weights_reg[1] );
        out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[12], weights_reg[1] );
        out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[13], weights_reg[1] );
        out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[14], weights_reg[1] );
        out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[15], weights_reg[1] );
        out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[16], weights_reg[1] );

        out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[11], weights_reg[2] );
        out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[12], weights_reg[2] );
        out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[13], weights_reg[2] );
        out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[14], weights_reg[2] );
        out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[15], weights_reg[2] );
        out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[16], weights_reg[2] );
        out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[17], weights_reg[2] );
        /* Load weights from SLM into registers - row1, output channels 16..23 */

        {
            __local uint *slm_ptrw1 = slm_ptr1 + 3*slm_read_pixel_offset + 4*8*8;

            weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
            weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
            slm_ptrw1 += slm_read_pixel_offset;

            weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
            weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
            slm_ptrw1 += slm_read_pixel_offset;

            weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
            weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
        }

        out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[9],  weights_reg[0] );
        out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[10], weights_reg[0] );
        out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[11], weights_reg[0] );
        out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[12], weights_reg[0] );
        out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[13], weights_reg[0] );
        out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[14], weights_reg[0] );
        out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[15], weights_reg[0] );

        /* Load 1Hx9Wx4N inputs, activation row2 */

        uint slm_row_offset_3 = 2*(TILE_W + 2)*BATCH_PACK*8;

        __attribute__((opencl_unroll_hint(9)))
        for (int ic = 0; ic < 9; ic++)
        {
            /* Load activations from SLM into registers */

            uint slm_offset = slm_row_offset_3 + ic * BATCH_PACK * 8 ;

            act_reg [ ic ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ;
        }

        out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[10], weights_reg[1] );
        out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[11], weights_reg[1] );
        out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[12], weights_reg[1] );
        out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[13], weights_reg[1] );
        out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[14], weights_reg[1] );
        out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[15], weights_reg[1] );
        out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[16], weights_reg[1] );

        out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[11], weights_reg[2] );
        out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[12], weights_reg[2] );
        out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[13], weights_reg[2] );
        out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[14], weights_reg[2] );
        out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[15], weights_reg[2] );
        out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[16], weights_reg[2] );
        out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[17], weights_reg[2] );
        /* Load weights from SLM into registers - row1, output channels 24..31 */

        {
            __local uint *slm_ptrw1 = slm_ptr1 + 3*slm_read_pixel_offset + 6*8*8;

            weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
            weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
            slm_ptrw1 += slm_read_pixel_offset;

            weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
            weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
            slm_ptrw1 += slm_read_pixel_offset;

            weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
            weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
        }

        out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[9],  weights_reg[0] );
        out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[10], weights_reg[0] );
        out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[11], weights_reg[0] );
        out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[12], weights_reg[0] );
        out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[13], weights_reg[0] );
        out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[14], weights_reg[0] );
        out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[15], weights_reg[0] );

        out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[10], weights_reg[1] );
        out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[11], weights_reg[1] );
        out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[12], weights_reg[1] );
        out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[13], weights_reg[1] );
        out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[14], weights_reg[1] );
        out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[15], weights_reg[1] );
        out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[16], weights_reg[1] );

        out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[11], weights_reg[2] );
        out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[12], weights_reg[2] );
        out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[13], weights_reg[2] );
        out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[14], weights_reg[2] );
        out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[15], weights_reg[2] );
        out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[16], weights_reg[2] );
        out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[17], weights_reg[2] );
        /**********************************************************************************************************
          Third phase - multiply the third row of weights and the third row of activations
        ***********************************************************************************************************/

        /* Load weights from SLM into registers - row2, output channels 0..7 */

        {
            __local uint *slm_ptrw2 = slm_ptr1 + 6*slm_read_pixel_offset;

            weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
            weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
            slm_ptrw2 += slm_read_pixel_offset;

            weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
            weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
            slm_ptrw2 += slm_read_pixel_offset;

            weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
            weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
        }

        out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[0], weights_reg[0] );
        out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[1], weights_reg[0] );
        out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[2], weights_reg[0] );
        out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[3], weights_reg[0] );
        out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[4], weights_reg[0] );
        out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[5], weights_reg[0] );
        out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[6], weights_reg[0] );

        out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[1], weights_reg[1] );
        out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[2], weights_reg[1] );
        out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[3], weights_reg[1] );
        out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[4], weights_reg[1] );
        out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[5], weights_reg[1] );
        out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[6], weights_reg[1] );
        out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[7], weights_reg[1] );

        out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[2], weights_reg[2] );
        out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[3], weights_reg[2] );
        out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[4], weights_reg[2] );
        out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[5], weights_reg[2] );
        out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[6], weights_reg[2] );
        out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[7], weights_reg[2] );
        out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[8], weights_reg[2] );
        /* Load weights from SLM into registers - row2, output channels 8..15 */

        {
            __local uint *slm_ptrw2 = slm_ptr1 + 6*slm_read_pixel_offset + 2*8*8;

            weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
            weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
            slm_ptrw2 += slm_read_pixel_offset;

            weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
            weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
            slm_ptrw2 += slm_read_pixel_offset;

            weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
            weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
        }

        out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[0], weights_reg[0] );
        out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[1], weights_reg[0] );
        out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[2], weights_reg[0] );
        out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[3], weights_reg[0] );
        out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[4], weights_reg[0] );
        out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[5], weights_reg[0] );
        out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[6], weights_reg[0] );

        out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[1], weights_reg[1] );
        out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[2], weights_reg[1] );
        out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[3], weights_reg[1] );
        out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[4], weights_reg[1] );
        out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[5], weights_reg[1] );
        out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[6], weights_reg[1] );
        out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[7], weights_reg[1] );

        out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[2], weights_reg[2] );
        out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[3], weights_reg[2] );
        out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[4], weights_reg[2] );
        out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[5], weights_reg[2] );
        out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[6], weights_reg[2] );
        out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[7], weights_reg[2] );
        out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[8], weights_reg[2] );
        /* Load weights from SLM into registers - row2, output channels 16..23 */

        {
            __local uint *slm_ptrw2 = slm_ptr1 + 6*slm_read_pixel_offset + 4*8*8;

            weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
            weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
            slm_ptrw2 += slm_read_pixel_offset;

            weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
            weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
            slm_ptrw2 += slm_read_pixel_offset;

            weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
            weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
        }

        out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[0], weights_reg[0] );
        out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[1], weights_reg[0] );
        out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[2], weights_reg[0] );
        out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[3], weights_reg[0] );
        out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[4], weights_reg[0] );
        out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[5], weights_reg[0] );
        out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[6], weights_reg[0] );

        out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[1], weights_reg[1] );
        out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[2], weights_reg[1] );
        out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[3], weights_reg[1] );
        out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[4], weights_reg[1] );
        out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[5], weights_reg[1] );
        out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[6], weights_reg[1] );
        out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[7], weights_reg[1] );

        out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[2], weights_reg[2] );
        out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[3], weights_reg[2] );
        out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[4], weights_reg[2] );
        out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[5], weights_reg[2] );
        out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[6], weights_reg[2] );
        out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[7], weights_reg[2] );
        out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[8], weights_reg[2] );
        /* Load weights from SLM into registers - row2, output channels 24..31 */

        {
            __local uint *slm_ptrw2 = slm_ptr1 + 6*slm_read_pixel_offset + 6*8*8;

            weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
            weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
            slm_ptrw2 += slm_read_pixel_offset;

            weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
            weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
            slm_ptrw2 += slm_read_pixel_offset;

            weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
            weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
        }

        out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[0], weights_reg[0] );
        out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[1], weights_reg[0] );
        out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[2], weights_reg[0] );
        out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[3], weights_reg[0] );
        out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[4], weights_reg[0] );
        out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[5], weights_reg[0] );
        out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[6], weights_reg[0] );

        out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[1], weights_reg[1] );
        out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[2], weights_reg[1] );
        out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[3], weights_reg[1] );
        out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[4], weights_reg[1] );
        out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[5], weights_reg[1] );
        out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[6], weights_reg[1] );
        out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[7], weights_reg[1] );

        out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[2], weights_reg[2] );
        out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[3], weights_reg[2] );
        out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[4], weights_reg[2] );
        out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[5], weights_reg[2] );
        out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[6], weights_reg[2] );
        out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[7], weights_reg[2] );
        out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[8], weights_reg[2] );

        // Make sure all threads in the WG have finished compute before the next depth tile of activations and weights is loaded into SLM
        barrier(CLK_LOCAL_MEM_FENCE);

    } // kd loop
    /****************************************************************************************************************
    *******************************Output Write Stage****************************************************************
    ****************************************************************************************************************/

    /* Outputs will be passed through the activation function and quantized to 8 bits before writing.
       Output will be in the same format as the input: [K/32][N/4][P][Q][4N][32K] */

    /******************* Write output *************************************/

    /* Quantize and pack 4x1 bytes - from consecutive n-coordinates.
       Each thread produces [1P][7Q][4N][32K].
       A uint32 is written from each lane; the entire thread writes 32 consecutive K-coordinates.

       Assume one output row as 32 uints ( 32 channels, four batches for each channel - 4NK ).
       The 7x7x4x32 block is stored first, then the next 32 channels follow. */

    /* Feature maps are an array of slicePacks; each H,W position within the slice pack contains 32 8-bit feature maps (channels) of 4 different batches */
    uint row_size_bytes = (_OW + OWPAD) * PACK * BATCH_PACK;

    /* slice_pack is a pack of 32 feature map tiles of shape [OH][OW][4][32], stored within the full [K/32][N/4][OH][OW][4][32] output */
    uint slice_pack_size_bytes = row_size_bytes * (_OH + OHPAD);
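
    /* Worked example (assumed sizes, for illustration only): with _OW = _OH = 7,
       no output padding, PACK = 32 and BATCH_PACK = 4, row_size_bytes = 7*32*4 = 896
       and slice_pack_size_bytes = 896*7 = 6272. */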
    /* Each output_depth WG writes 64 output channels */

    uint output_depth_index = output_depth*2 + threadid_mod_2;
    uint batch_index        = batch;

    /* Each WG produces the entire 7x7 output, hence no group_y, group_z tiling */

    uint output_offset_x = groupy_tile * OUT_X_PITCH;
    uint output_offset_y = groupz_tile * OUT_Y_PITCH;

    uint slice_pack_addr_bytes = output_depth_index * slice_pack_size_bytes * ( BATCH_SIZE / BATCH_PACK ) + batch_index * slice_pack_size_bytes + lid_z * row_size_bytes;
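
    /* Worked example (continuing the assumed sizes above, with BATCH_SIZE = 8):
       output_depth_index = 1, batch_index = 0 and lid_z = 2 give
       slice_pack_addr_bytes = 1*6272*(8/4) + 0 + 2*896 = 14336. */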
    __global uchar* output_write_ptr = (__global uchar *) &outputs [ slice_pack_addr_bytes + output_offset_x + output_offset_y ];

    const uint feature = output_depth_index * 32 + get_sub_group_local_id();

    const float4 quant_f = as_float4(intel_sub_group_block_read4((__global uint*) (quantizations + feature) ));
    const float4 bias_f  = as_float4(intel_sub_group_block_read4((__global uint*) (biases + feature) ));
    const float4 calib_f = as_float4(intel_sub_group_block_read4((__global uint*) (calibrations + feature) ));
    __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
    for (int col = 0; col < OUT_BLOCK_WIDTH; col++)
    {
        int4 outvec0 = out_07[col];
        int4 outvec1 = out_815[col];
        int4 outvec2 = out_1623[col];
        int4 outvec3 = out_2431[col];

        /* Non-Linear Activation & Quantization code */

        uchar8 out_write_N2K4[2];

        QUANTIZATION;

        intel_sub_group_block_write_uc8 ( output_write_ptr , out_write_N2K4[0] );
        output_write_ptr += 64;
        intel_sub_group_block_write_uc8 ( output_write_ptr , out_write_N2K4[1] );
        output_write_ptr += 64;

    } // out_block_width-for loop
}
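
/* Dispatch sketch (illustrative assumption, not part of the original kernel):
   per the work-group comment above, this kernel expects a local size of
   (16, 1, 8) with sub-group size 8; work-group dimension 0 walks groups of
   64 output channels (and batch packs), dimension 1 walks output-width tiles
   and dimension 2 walks output-height tiles. */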