Publishing 2019 R1 content
[platform/upstream/dldt.git] / inference-engine / thirdparty / clDNN / kernel_selector / core / cl_kernels / convolution_gpu_mmad_slm_7x7_rep4.cl
1 // Copyright (c) 2016-2017 Intel Corporation
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "include/mmad.cl"
16
17 #define SCALE 0.11f
18
19 #ifdef LIGHTWEIGHT_QUANTIZATION
20
21 #define QUANTIZATION \
22     out_write_N2K4[0].s0 = convert_uchar_sat((float)outvec0.s0 * SCALE + bias_f.s0); /*K= lane_id,N=0*/ \
23     out_write_N2K4[0].s1 = convert_uchar_sat((float)outvec1.s0 * SCALE + bias_f.s1); /*K= lane_id + 8,N=0*/\
24     out_write_N2K4[0].s2 = convert_uchar_sat((float)outvec2.s0 * SCALE + bias_f.s2); /*K= lane_id + 16,N=0*/\
25     out_write_N2K4[0].s3 = convert_uchar_sat((float)outvec3.s0 * SCALE + bias_f.s3); /*K= lane_id + 24,N=0*/\
26     \    
27     out_write_N2K4[0].s4 = convert_uchar_sat((float)outvec0.s1 * SCALE + bias_f.s0); /*K= lane_id,N=1*/\
28     out_write_N2K4[0].s5 = convert_uchar_sat((float)outvec1.s1 * SCALE + bias_f.s1); /*K= lane_id + 8,N=1*/\
29     out_write_N2K4[0].s6 = convert_uchar_sat((float)outvec2.s1 * SCALE + bias_f.s2); /*K= lane_id + 16,N=1*/\
30     out_write_N2K4[0].s7 = convert_uchar_sat((float)outvec3.s1 * SCALE + bias_f.s3); /*K= lane_id + 24,N=1*/\
31     \
32     out_write_N2K4[1].s0 = convert_uchar_sat((float)outvec0.s2 * SCALE + bias_f.s0); /*K= lane_id,N=2*/\
33     out_write_N2K4[1].s1 = convert_uchar_sat((float)outvec1.s2 * SCALE + bias_f.s1); /*K= lane_id + 8,N=2*/\
34     out_write_N2K4[1].s2 = convert_uchar_sat((float)outvec2.s2 * SCALE + bias_f.s2); /*K= lane_id + 16,N=2*/\
35     out_write_N2K4[1].s3 = convert_uchar_sat((float)outvec3.s2 * SCALE + bias_f.s3); /*K= lane_id + 24,N=2*/\
36     \
37     out_write_N2K4[1].s4 = convert_uchar_sat((float)outvec0.s3 * SCALE + bias_f.s0); /*K= lane_id,N=3*/\
38     out_write_N2K4[1].s5 = convert_uchar_sat((float)outvec1.s3 * SCALE + bias_f.s1); /*K= lane_id + 8,N=3*/\
39     out_write_N2K4[1].s6 = convert_uchar_sat((float)outvec2.s3 * SCALE + bias_f.s2); /*K= lane_id + 16,N=3*/\
40     out_write_N2K4[1].s7 = convert_uchar_sat((float)outvec3.s3 * SCALE + bias_f.s3); /*K= lane_id + 24,N=3*/
41
42 #elif NO_QUANTIZATION
43
44 #define QUANTIZATION \
45     out_write_N2K4[0].s0 = convert_uchar_sat(outvec0.s0); /*K= lane_id,N=0*/ \
46     out_write_N2K4[0].s1 = convert_uchar_sat(outvec1.s0); /*K= lane_id + 8,N=0*/\
47     out_write_N2K4[0].s2 = convert_uchar_sat(outvec2.s0); /*K= lane_id + 16,N=0*/\
48     out_write_N2K4[0].s3 = convert_uchar_sat(outvec3.s0); /*K= lane_id + 24,N=0*/\
49     \    
50     out_write_N2K4[0].s4 = convert_uchar_sat(outvec0.s1); /*K= lane_id,N=1*/\
51     out_write_N2K4[0].s5 = convert_uchar_sat(outvec1.s1); /*K= lane_id + 8,N=1*/\
52     out_write_N2K4[0].s6 = convert_uchar_sat(outvec2.s1); /*K= lane_id + 16,N=1*/\
53     out_write_N2K4[0].s7 = convert_uchar_sat(outvec3.s1); /*K= lane_id + 24,N=1*/\
54     \
55     out_write_N2K4[1].s0 = convert_uchar_sat(outvec0.s2); /*K= lane_id,N=2*/\
56     out_write_N2K4[1].s1 = convert_uchar_sat(outvec1.s2); /*K= lane_id + 8,N=2*/\
57     out_write_N2K4[1].s2 = convert_uchar_sat(outvec2.s2); /*K= lane_id + 16,N=2*/\
58     out_write_N2K4[1].s3 = convert_uchar_sat(outvec3.s2); /*K= lane_id + 24,N=2*/\
59     \
60     out_write_N2K4[1].s4 = convert_uchar_sat(outvec0.s3); /*K= lane_id,N=3*/\
61     out_write_N2K4[1].s5 = convert_uchar_sat(outvec1.s3); /*K= lane_id + 8,N=3*/\
62     out_write_N2K4[1].s6 = convert_uchar_sat(outvec2.s3); /*K= lane_id + 16,N=3*/\
63     out_write_N2K4[1].s7 = convert_uchar_sat(outvec3.s3); /*K= lane_id + 24,N=3*/
64
65 #else
66
67 #define QUANTIZATION \
68     out_write_N2K4[0].s0 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec0.s0) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N)); /*K= lane_id,N=0*/ \
69     out_write_N2K4[0].s1 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec1.s0) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N)); /*K= lane_id + 8,N=0*/\
70     out_write_N2K4[0].s2 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec2.s0) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N)); /*K= lane_id + 16,N=0*/\
71     out_write_N2K4[0].s3 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec3.s0) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N)); /*K= lane_id + 24,N=0*/\
72     \    
73     out_write_N2K4[0].s4 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec0.s1) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N)); /*K= lane_id,N=1*/\
74     out_write_N2K4[0].s5 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec1.s1) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N)); /*K= lane_id + 8,N=1*/\
75     out_write_N2K4[0].s6 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec2.s1) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N)); /*K= lane_id + 16,N=1*/\
76     out_write_N2K4[0].s7 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec3.s1) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N)); /*K= lane_id + 24,N=1*/\
77     \
78     out_write_N2K4[1].s0 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec0.s2) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N)); /*K= lane_id,N=2*/\
79     out_write_N2K4[1].s1 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec1.s2) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N)); /*K= lane_id + 8,N=2*/\
80     out_write_N2K4[1].s2 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec2.s2) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N)); /*K= lane_id + 16,N=2*/\
81     out_write_N2K4[1].s3 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec3.s2) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N)); /*K= lane_id + 24,N=2*/\
82     \
83     out_write_N2K4[1].s4 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec0.s3) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N)); /*K= lane_id,N=3*/\
84     out_write_N2K4[1].s5 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec1.s3) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N)); /*K= lane_id + 8,N=3*/\
85     out_write_N2K4[1].s6 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec2.s3) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N)); /*K= lane_id + 16,N=3*/\
86     out_write_N2K4[1].s7 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec3.s3) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N)); /*K= lane_id + 24,N=3*/
87
88 #endif
89
90 // mapping to clDNN
91 #define _MMAD_4x8(C, A, B) MMAD_4x8(A, B, C)
92 #define _OD OUTPUT_FEATURE_NUM
93 #define _OW OUTPUT_SIZE_X
94 #define _OH OUTPUT_SIZE_Y
95 #define OWPAD (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X)
96 #define OHPAD (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y)
97 #define _IH INPUT0_SIZE_Y
98 #define _IW INPUT0_SIZE_X
99 #define _ID INPUT0_FEATURE_NUM
100 #define K_HEIGHT FILTER_SIZE_Y
101 #define K_WIDTH FILTER_SIZE_X
102 #define BATCH_SIZE OUTPUT_BATCH_NUM
103
104 #define IHPAD (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y)
105 #define IWPAD (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_PAD_AFTER_SIZE_X)
106 #define K_STRIDE STRIDE_SIZE_X
107 // end of mapping
108
109 // for now kernel stride is square
110 #define K_WSTRIDE K_STRIDE
111 #define K_HSTRIDE K_STRIDE
112
113 #define PACK 32
114 #define BATCH_PACK 4
115
116 __attribute__((intel_reqd_sub_group_size(8)))
117 KERNEL(convolution_mmad_slm_2x14_rep4)(
118 __global int8 *inputs,
119 __global uchar* outputs,
120 __global int8* weights,
121 #if BIAS_TERM
122     __global BIAS_TYPE* biases,
123 #endif
124 #if QUANTIZATION_TERM
125     const __global float* quantizations,
126 #endif
127 #if CALIBRATION_TERM
128     const __global float* calibrations,
129 #endif
130     uint split_idx
131 )
132 {
133         const uint TILE_H = OUT_BLOCK_HEIGHT*LOCAL_SIZE_Z;
134         const uint TILE_W = OUT_BLOCK_WIDTH*LOCAL_SIZE_Y;
135
136         ushort fmg     = get_group_id(0);   // Output Depth
137         ushort group_y = get_group_id(1);   // Output Width
138         ushort group_z = get_group_id(2);   // Output Height
139
140         /* 16,1,8 WG , SIMD8 - 16 HW threads in a WG
141         threads 0-1 : ( lid_x:0-15,lid_y:0,lid_z:0)
142         threads 2-3 : ( lid_x:0-15,lid_y:0,lid_z:1)
143         ..
144         threads 12-13: ( lid_x:0-15, lid_y:0,lid_z:6)
145         threads 14-15: ( lid_x:0-15, lid_y:0,lid_z:7)
146         */
147
148         /* Thread, local IDs */
149         ushort thread_id                = get_sub_group_id();
150         ushort threadid_mod_2   = thread_id % 2;
151         ushort threadid_mod_8   = thread_id % 8;
152
153         ushort lid_x    = get_local_id(0);
154         ushort lid_z    = get_local_id(2);
155
156         uchar  lane_id  = get_sub_group_local_id();
157
158         /* 32-bit signed accumulator , 112 output registers for 1Px7Qx4Nx32K output tile size
159            Will be converted to 8-bits before final write */
160
161         int4  out_07 [ OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH ]   = {0}; // For output channels 0-7
162         int4  out_815[ OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH ]   = {0}; // For output channels 8-15
163         int4  out_1623[ OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH ]  = {0}; // For output channels 16-23
164         int4  out_2431[ OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH ]  = {0}; // For output channels 24-31
165
166         /* Account for batching */
167
168         ushort batch    = ( fmg*LOCAL_SIZE_X*4 ) /_OD; // Each thread processing 32 output_channels and each fmg processing 64 output channels , LOCAL_SIZE_X is only 16
169
170         // Size calculated for int8 elements
171         uint input_size = (_IH + IHPAD) * (_IW + IWPAD) * BATCH_PACK ;
172
173         uint in_addr_offset = batch*input_size;
174
175         /* Goto activation tile for work group, offset is w.r.t int8 array */
176
177         uint groupy_tile = TILE_W*group_y;
178         uint groupz_tile = TILE_H*group_z;
179
180     in_addr_offset += (groupz_tile * K_STRIDE) * (_IW + IWPAD) * BATCH_PACK + (groupy_tile * K_STRIDE) * BATCH_PACK;
181
182                 /* SLM space for Activation, Weights
183                ( 16,1,8 ) Workgroup - 7 tiles along Y direction and 64 different output channels
184                     2 threads used to load global memory
185                 Activation - 9Hx9Wx4Nx32C Weights -3Rx3Sx64Kx32C        */
186
187         __local int8 act_slm      [  9*9*4 ];
188         __local int8 weight_slm   [  9*64  ];
189
190    /* 9Hx9Wx4Nx32C activation tile written into SLM.  Distribute among 14 threads in Workgroup
191            threads 0-1 write 9x4x32 of  H=0, W=0...8
192            threads 2-3 write 9x4x32 of H=1, W=0...8
193            threads 4-5 write 9x4x32 of H=2, W=0...8
194            threads 6-7  write 9x4x32 of H=3, W=0...8
195            threads 8-9 write 9x4x32 of H=4, W=0...8
196            threads 10-11 write 9x4x32 of H=5,W=0...8
197            threads 12-13 write 9x4x32 of H=6,W=0...8
198            threads 14 write 9x4x32 of H=7,W=0...8
199            threads 15 write 9x4x32 of H=8,W=0...8 */
200
201         /* Goto activation tile for thread in group */
202
203         uint row_offset   =  thread_id / 2;
204
205         if ( thread_id >= 14 )
206     {
207         row_offset = 7;
208         }
209
210         // In addr offset for the particular thread
211         in_addr_offset    += row_offset * K_STRIDE * (_IW + IWPAD ) * BATCH_PACK ;
212
213    /* Activation SLM indices */
214     uint act_slm_write =  row_offset * ( TILE_W + 2) * BATCH_PACK;
215         uint act_slm_read  =  OUT_BLOCK_HEIGHT * lid_z * ( TILE_W + 2) * BATCH_PACK ;
216
217         /* 9RSx64Kx32C Weight Block in SLM
218            thread0 handles ( reads from global ) w(0,0),w(0,1),w(0,2) of K=0,1 ( k=0..15 )
219            thread1 handles w(0,0),w(0,1),w(0,2) of K=2,3 ( k=16..31)
220            thread2 handles w(1,0),w(1,1) of K=0,1 ( k=0..15)
221            thread3 handles w(1,0),w(1,1) of K=2,3 ( k=16..31)
222            thread4 handles w(1,2),w(2,0) of K=0,1 ( k=0..15)
223            thread5 handles w(1,2),w(2,0) of K=2,3 ( k=16..31)
224            thread6 handles w(2,1),w(2,2) of K=0,1 ( k=0..15)
225            thread7 handles w(2,1),w(2,2) of K=2,3 ( k=16..31)
226
227            Similarly threads8-15 handles for K=4,5,6,7
228
229            Weight Layout in SLM
230
231            w(R=0,S=0,k=0..7,C=0..15),w(R=0,S=0,k=32..39,C=0..15)
232            w(R=0,S=0,k=0..7,C=16..31),w(R=0,S=0,k=32..39,C=16..31)
233
234            Above interleaving present to avoid SLM Bank conflicts when fused threads read from SLM
235            Thread0 will read k=0..31, thread1 will read k=32..63
236
237            First all output channels are present in SLM, then next weight pixel is present in SLM */
238
239          #define NUM_FILTERS (K_HEIGHT * K_WIDTH)
240
241          uint output_depth    = fmg % ( _OD / ( LOCAL_SIZE_X * 4 ) ); //LOCAL_SIZE_X=16, 64 output channels used
242
243          uint weight_size_CRS =  ( _ID / PACK ) * NUM_FILTERS * 8; //8 output channels packed inside
244
245          // Global weight addr for workgroup
246          uint weight_global_addr_offset =  output_depth * 8 * weight_size_CRS ; //64 output channels per workgroup
247
248          /* Global weight address for thread */
249
250          // Goto appropriate output channel in weights
251          uint weight_global_channel_offset = threadid_mod_2 * 2 * weight_size_CRS ;
252
253         uint slm_channel_offset     = threadid_mod_2;
254         uint bc_fused_thread_offset = 0;
255
256          if ( thread_id >= 8 )
257     {
258                 bc_fused_thread_offset =  1;
259
260                 weight_global_channel_offset =  4 * weight_size_CRS + slm_channel_offset * weight_size_CRS * 2 ;
261     }
262
263          // Goto appropriate pixel in weights
264
265          uint weight_global_pixel_offset = 0;
266          uint slm_pixel_offset = 0;
267
268     if ( threadid_mod_8 >=2  )
269     {
270          /* First three pixels handled by threads 0-1, then 2 pixels handled by two threads */
271
272                 weight_global_pixel_offset = 3*8 +  ( ( (threadid_mod_8/2) - 1 )*2*8 );
273                 slm_pixel_offset                   = 3*64 + ( ( (threadid_mod_8/2) - 1 )*2*64 );
274     }
275
276     weight_global_addr_offset += weight_global_channel_offset + weight_global_pixel_offset;
277
278          /* Weight slm write index */
279
280          uint slm_write_weight = slm_pixel_offset + slm_channel_offset * 32 + bc_fused_thread_offset * 4;
281
282          /* Weight slm read index */
283
284          /* Thread 0  reads output channels 0-15, thread 1 handles output channels 16-31, data present in interleaved
285             manner in SLM
286                 Data layout in SLM
287
288                 w(0,0) C=0..7, K = 0..7 | w(0,0) C=0..7, K = 32..39
289                 w(0,0) C=8..15,K=0..7   | w(0,0) C=8..15,K = 32..39
290                 w(0,0) C=0..7, K=8..15  | w(0,0) C=0..7, K = 40..47
291                 w(0,0) C=8..15,K=8..15  | w(0,0) C=8..15,K=  40..47
292
293                 */
294     uint wt_slm_rd_offset = threadid_mod_2*4;
295
296         int kd;
297
298         __attribute__((opencl_unroll_hint(1)))
299         for(kd = 0; kd <  ( _ID / PACK ) ; kd++)
300         {
301                 {
302                         /* Load Activation from global to SLM */
303
304                         int in_addr = kd * (_IH + IHPAD) * (_IW + IWPAD) * BATCH_SIZE + in_addr_offset;
305
306                         __global uint *activation_tile = (__global uint*)&inputs[ in_addr ];
307
308                         __local uint *act_slm_ptr   = (__local uint *) &act_slm [ act_slm_write  ];
309
310                         /* The odd thread in fused pair will start from next 4x8 block */
311
312                         activation_tile += threadid_mod_2*4*8;
313                         act_slm_ptr     += threadid_mod_2*4*8;
314
315                         int4 act_col_0 =  as_int4( intel_sub_group_block_read4(activation_tile) );//col 0
316                         int4 act_col_1 =  as_int4( intel_sub_group_block_read4(activation_tile + 8*8) );//col 2
317                         int4 act_col_2 =  as_int4( intel_sub_group_block_read4(activation_tile + 2*8*8) );//col 4
318                         int4 act_col_3 =  as_int4( intel_sub_group_block_read4(activation_tile + 3*8*8) );//col 6
319
320                         SLM_BLOCK_WRITE_4 ( act_slm_ptr , as_uint4 ( act_col_0 ) );
321                         SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 8*8 ) , as_uint4 ( act_col_1 ) );
322                         SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 2*8*8 ) , as_uint4 ( act_col_2 ) );
323                         SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 3*8*8 ) , as_uint4 ( act_col_3 ) );
324
325                         if ( threadid_mod_2  == 0 )
326             {
327                                 int4 act_col_4 =  as_int4( intel_sub_group_block_read4(activation_tile + 4*8*8) );
328
329                                 SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 4*8*8 ) , as_uint4 ( act_col_4 ) );
330                         }
331
332                         if ( thread_id >=14)
333             {
334                                 activation_tile  = activation_tile + 1 * (_IW + IWPAD ) * BATCH_PACK * 8;
335                                 act_slm_ptr      = act_slm_ptr + (TILE_W + 2)  * BATCH_PACK *8;
336
337                                 int4 act_col_9 =  as_int4( intel_sub_group_block_read4(activation_tile) );
338                                 int4 act_col_10 =  as_int4( intel_sub_group_block_read4(activation_tile + 8*8) );
339                                 int4 act_col_11 =  as_int4( intel_sub_group_block_read4(activation_tile + 2*8*8) );
340                                 int4 act_col_12 =  as_int4( intel_sub_group_block_read4(activation_tile + 3*8*8) );
341
342                                 SLM_BLOCK_WRITE_4 ( act_slm_ptr  , as_uint4 ( act_col_9 ) );
343                                 SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 8*8 )   , as_uint4 ( act_col_10 ) );
344                                 SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 2*8*8 ) , as_uint4 ( act_col_11 ) );
345                                 SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 3*8*8 ) , as_uint4 ( act_col_12 ) );
346
347                                 if ( threadid_mod_2  == 0 )
348                 {
349                                         int4 act_col_13 =  as_int4( intel_sub_group_block_read4(activation_tile + 4*8*8) );
350
351                                         SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 4*8*8 ) , as_uint4 ( act_col_13 ) );
352                                 }
353                         }
354
355                 /* load weights from global to weight_slm */
356
357                         int weight_addr = kd * NUM_FILTERS * 8 + weight_global_addr_offset;
358
359                         __global uint *weight_tile   = (__global uint*)&weights    [ weight_addr ];
360                         __local  uint *wt_slm_ptr    = (__local uint *)&weight_slm [ slm_write_weight  ];
361
362                         __global uint *weight_tile_2   = weight_tile;
363                         __local uint *wt_slm_ptr_2     = wt_slm_ptr;
364
365                         int4 w0 = as_int4 ( intel_sub_group_block_read4( weight_tile ) );       // Pixel1 K=0..7 C=0..15
366                         int4 w1 = as_int4 ( intel_sub_group_block_read4( weight_tile + 4*8 ) ); // Pixel1 K=0..7 C=16..31
367                         int4 w2 = as_int4 ( intel_sub_group_block_read4( weight_tile + 8*8 ) ); // Pixel2 K=0..7 C=0..15
368                         int4 w3 = as_int4 ( intel_sub_group_block_read4( weight_tile + 12*8 ) );// Pixel2 K=0..7 C=16..31
369
370                         // Goto next output channel
371                         weight_tile += weight_size_CRS*8;
372
373                         int4 w4 = as_int4 ( intel_sub_group_block_read4( weight_tile ) );       // Pixel1 K=8..15 C=0..15
374                         int4 w5 = as_int4 ( intel_sub_group_block_read4( weight_tile + 4*8 ) ); // Pixel1 K=8..15 C=16..31
375                         int4 w6 = as_int4 ( intel_sub_group_block_read4( weight_tile + 8*8 ) ); // Pixel2 K=8..15 C=0..15
376                         int4 w7 = as_int4 ( intel_sub_group_block_read4( weight_tile + 12*8 ) );// Pixel2 K=8..15 C=16..31
377
378                         SLM_BLOCK_WRITE_4 ( wt_slm_ptr, as_uint4 ( w0 ) );
379                         SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 8*8 ) , as_uint4 ( w1 ) );
380                         SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 64*8 ), as_uint4 ( w2 ) );
381                         SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 64*8 + 8*8 ), as_uint4 ( w3 ) );
382
383                         wt_slm_ptr  += 16*8;
384
385                         SLM_BLOCK_WRITE_4 ( wt_slm_ptr , as_uint4 ( w4 ) );
386                         SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 8*8 )   , as_uint4 ( w5 ) );
387                         SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 64*8 ) , as_uint4 ( w6 ) );
388                         SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 64*8 + 8*8 ) , as_uint4 ( w7 ) );
389
390                    if( threadid_mod_8 < 2 )
391            {
392                                 // Goto next pixel
393                                 weight_tile_2 += 16*8;
394                                 wt_slm_ptr_2  += 2*64*8;
395
396                                 int4 w0 = as_int4 ( intel_sub_group_block_read4( weight_tile_2 ) );     // Pixel1 K=0..7 C=0..15
397                                 int4 w1 = as_int4 ( intel_sub_group_block_read4( weight_tile_2 + 4*8 ) );       // Pixel1 K=0..7 C=16..31
398
399                                 // Goto next output channel
400                                 weight_tile_2 += weight_size_CRS*8;
401
402                                 int4 w4 = as_int4 ( intel_sub_group_block_read4( weight_tile_2 ) );     // Pixel1 K=8..15 C=0..15
403                                 int4 w5 = as_int4 ( intel_sub_group_block_read4( weight_tile_2 + 4*8 ) );       // Pixel1 K=8..15 C=16..31
404
405                                 SLM_BLOCK_WRITE_4 ( wt_slm_ptr_2, as_uint4 ( w0 ) );
406                                 SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr_2 + 8*8 ) , as_uint4 ( w1 ) );
407
408                                 wt_slm_ptr_2  += 16*8;
409
410                                 SLM_BLOCK_WRITE_4 ( wt_slm_ptr_2 , as_uint4 ( w4 ) );
411                                 SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr_2 + 8*8 )   , as_uint4 ( w5 ) );
412                         }
413         }
414
415                 // Synchronize SLM writes across workgroup
416                  barrier(CLK_LOCAL_MEM_FENCE);
417
418                 if ( lid_z <= 6 )
419         {
420                         uint wt_slm_rd = wt_slm_rd_offset;
421
422                         __local uint *slm_ptr0     = (__local uint *) &act_slm[ act_slm_read ];
423                         __local uint *slm_ptr1     = (__local uint *) &weight_slm[ wt_slm_rd ];
424
425                         /* balancing load of weights, activations   */
426                         int8 weights_reg[3]; //24 registers
427                         int4 act_reg[18];    //72 registers
428                         uint slm_read_pixel_offset = 64*8;
429
430                         /**********************************************************************************************************
431                           First phase - multiply first row of weights  and 1st row of activations
432                         ***********************************************************************************************************/
433
434                          /* Load weights from SLM into registers - row0, output channels 0..7  */
435
436                                 {
437                                                 __local uint *slm_ptrw0  = slm_ptr1;
438
439                                             weights_reg[0].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
440                                             weights_reg[0].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
441                                                 slm_ptrw0                        += slm_read_pixel_offset;
442
443                                                 weights_reg[1].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
444                                             weights_reg[1].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
445                                                 slm_ptrw0                        += slm_read_pixel_offset;
446
447                                                 weights_reg[2].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
448                                             weights_reg[2].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
449                                 }
450
451                         /* load 1Hx9Wx4N inputs, Activation row0   */
452
453                                 __attribute__((opencl_unroll_hint(9)))
454                                 for (int ic = 0; ic < 9; ic++)
455                                 {
456                          /* Load activations from SLM into registers  */
457
458                                          uint slm_offset = ic * BATCH_PACK * 8 ;
459
460                                  act_reg [ ic ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ;
461                                 }
462
463                         /* Convolve */
464
465                            /* order the mmad instructions to minimize dependency on src0,dst - also try to maximise reuse of weights-reg*/
466
467                                 /*  Output channels 0-7 */
468
469                                 out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[0], weights_reg[0] );
470                                 out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[1], weights_reg[0] );
471                                 out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[2], weights_reg[0] );
472                                 out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[3], weights_reg[0] );
473                                 out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[4], weights_reg[0] );
474                                 out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[5], weights_reg[0] );
475                                 out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[6], weights_reg[0] );
476
477                                 out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[1], weights_reg[1] );
478                                 out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[2], weights_reg[1] );
479                                 out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[3], weights_reg[1] );
480                                 out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[4], weights_reg[1] );
481                                 out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[5], weights_reg[1] );
482                                 out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[6], weights_reg[1] );
483                                 out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[7], weights_reg[1] );
484
485                                 out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[2], weights_reg[2] );
486                                 out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[3], weights_reg[2] );
487                                 out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[4], weights_reg[2] );
488                                 out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[5], weights_reg[2] );
489                                 out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[6], weights_reg[2] );
490                                 out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[7], weights_reg[2] );
491                                 out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[8], weights_reg[2] );
492
493                      /* Load weights from SLM into registers - row0, output channels 8..15  */
494
495                                 {
496                                                 __local uint *slm_ptrw0 = slm_ptr1 + 2*8*8;
497
498                                             weights_reg[0].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
499                                             weights_reg[0].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
500                                                 slm_ptrw0                        += slm_read_pixel_offset;
501
502                                                 weights_reg[1].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
503                                             weights_reg[1].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
504                                                 slm_ptrw0                        += slm_read_pixel_offset;
505
506                                                 weights_reg[2].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
507                                             weights_reg[2].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
508                                 }
509
510                                 out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[0], weights_reg[0] );
511                                 out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[1], weights_reg[0] );
512                                 out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[2], weights_reg[0] );
513                                 out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[3], weights_reg[0] );
514                                 out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[4], weights_reg[0] );
515                                 out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[5], weights_reg[0] );
516                                 out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[6], weights_reg[0] );
517
518                                 out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[1], weights_reg[1] );
519                                 out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[2], weights_reg[1] );
520                                 out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[3], weights_reg[1] );
521                                 out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[4], weights_reg[1] );
522                                 out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[5], weights_reg[1] );
523                                 out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[6], weights_reg[1] );
524                                 out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[7], weights_reg[1] );
525
526                                 out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[2], weights_reg[2] );
527                                 out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[3], weights_reg[2] );
528                                 out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[4], weights_reg[2] );
529                                 out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[5], weights_reg[2] );
530                                 out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[6], weights_reg[2] );
531                                 out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[7], weights_reg[2] );
532                                 out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[8], weights_reg[2] );
533
534                                 /* Load weights from SLM into registers - row0, output channels 16..23  */
535                                 {
536                                                 __local uint *slm_ptrw0 = slm_ptr1 + 4*8*8;
537
538                                             weights_reg[0].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
539                                             weights_reg[0].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
540                                                 slm_ptrw0                        += slm_read_pixel_offset;
541
542                                                 weights_reg[1].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
543                                             weights_reg[1].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
544                                                 slm_ptrw0                        += slm_read_pixel_offset;
545
546                                                 weights_reg[2].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
547                                             weights_reg[2].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
548                                 }
549
550                                 out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[0], weights_reg[0] );
551                                 out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[1], weights_reg[0] );
552                                 out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[2], weights_reg[0] );
553                                 out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[3], weights_reg[0] );
554                                 out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[4], weights_reg[0] );
555                                 out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[5], weights_reg[0] );
556                                 out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[6], weights_reg[0] );
557
558                                 /* load 1Hx9Wx4N inputs, Activation row1   */
559
560                                 uint slm_row_offset_2     = 1*(TILE_W + 2)*BATCH_PACK*8;
561
562                                 __attribute__((opencl_unroll_hint(9)))
563                                 for (int ic = 0; ic < 9; ic++)
564                                 {
565                          /* Load activations from SLM into registers  */
566
567                                          uint slm_offset = slm_row_offset_2 + ic * BATCH_PACK * 8 ;
568
569                                  act_reg [ ic + 9 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ;
570                                 }
571
572                                 out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[1], weights_reg[1] );
573                                 out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[2], weights_reg[1] );
574                                 out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[3], weights_reg[1] );
575                                 out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[4], weights_reg[1] );
576                                 out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[5], weights_reg[1] );
577                                 out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[6], weights_reg[1] );
578                                 out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[7], weights_reg[1] );
579
580                                 out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[2], weights_reg[2] );
581                                 out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[3], weights_reg[2] );
582                                 out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[4], weights_reg[2] );
583                                 out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[5], weights_reg[2] );
584                                 out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[6], weights_reg[2] );
585                                 out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[7], weights_reg[2] );
586                                 out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[8], weights_reg[2] );
587
588                                 /* Load weights from SLM into registers - row0, output channels 24..31  */
589                                 {
590                                                 __local uint *slm_ptrw0 = slm_ptr1 + 6*8*8;
591
592                                             weights_reg[0].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
593                                             weights_reg[0].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
594                                                 slm_ptrw0                        += slm_read_pixel_offset;
595
596                                                 weights_reg[1].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
597                                             weights_reg[1].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
598                                                 slm_ptrw0                        += slm_read_pixel_offset;
599
600                                                 weights_reg[2].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
601                                             weights_reg[2].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
602                                 }
603
604                                 out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[0], weights_reg[0] );
605                                 out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[1], weights_reg[0] );
606                                 out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[2], weights_reg[0] );
607                                 out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[3], weights_reg[0] );
608                                 out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[4], weights_reg[0] );
609                                 out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[5], weights_reg[0] );
610                                 out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[6], weights_reg[0] );
611
612                                 out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[1], weights_reg[1] );
613                                 out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[2], weights_reg[1] );
614                                 out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[3], weights_reg[1] );
615                                 out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[4], weights_reg[1] );
616                                 out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[5], weights_reg[1] );
617                                 out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[6], weights_reg[1] );
618                                 out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[7], weights_reg[1] );
619
620                                 out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[2], weights_reg[2] );
621                                 out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[3], weights_reg[2] );
622                                 out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[4], weights_reg[2] );
623                                 out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[5], weights_reg[2] );
624                                 out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[6], weights_reg[2] );
625                                 out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[7], weights_reg[2] );
626                                 out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[8], weights_reg[2] );
627
628                         /**********************************************************************************************************
629                           Second phase - multiply second row of weights  and second row of activations
630                         ***********************************************************************************************************/
631
632                          /* Load weights from SLM into registers - row1, output channels 0..7  */
633                                 {
634                                                 __local uint *slm_ptrw1  = slm_ptr1 + 3*slm_read_pixel_offset;
635
636                                             weights_reg[0].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
637                                             weights_reg[0].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
638                                                 slm_ptrw1                        += slm_read_pixel_offset;
639
640                                                 weights_reg[1].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
641                                             weights_reg[1].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
642                                                 slm_ptrw1                            += slm_read_pixel_offset;
643
644                                                 weights_reg[2].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
645                                             weights_reg[2].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
646                                 }
647
648                                 out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[9], weights_reg[0] );
649                                 out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[10], weights_reg[0] );
650                                 out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[11], weights_reg[0] );
651                                 out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[12], weights_reg[0] );
652                                 out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[13], weights_reg[0] );
653                                 out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[14], weights_reg[0] );
654                                 out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[15], weights_reg[0] );
655
656                                 out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[10], weights_reg[1] );
657                                 out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[11], weights_reg[1] );
658                                 out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[12], weights_reg[1] );
659                                 out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[13], weights_reg[1] );
660                                 out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[14], weights_reg[1] );
661                                 out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[15], weights_reg[1] );
662                                 out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[16], weights_reg[1] );
663
664                                 out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[11], weights_reg[2] );
665                                 out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[12], weights_reg[2] );
666                                 out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[13], weights_reg[2] );
667                                 out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[14], weights_reg[2] );
668                                 out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[15], weights_reg[2] );
669                                 out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[16], weights_reg[2] );
670                                 out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[17], weights_reg[2] );
671
672                                     /* Load weights from SLM into registers - row1, output channels 8..15  */
673                                 {
674                                                 __local uint *slm_ptrw1 = slm_ptr1 + 3*slm_read_pixel_offset + 2*8*8;
675
676                                             weights_reg[0].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
677                                             weights_reg[0].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
678                                                 slm_ptrw1                          += slm_read_pixel_offset;
679
680                                                 weights_reg[1].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
681                                             weights_reg[1].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
682                                                 slm_ptrw1                          += slm_read_pixel_offset;
683
684                                                 weights_reg[2].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
685                                             weights_reg[2].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
686                                 }
687
688                                 out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[9], weights_reg[0] );
689                                 out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[10], weights_reg[0] );
690                                 out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[11], weights_reg[0] );
691                                 out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[12], weights_reg[0] );
692                                 out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[13], weights_reg[0] );
693                                 out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[14], weights_reg[0] );
694                                 out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[15], weights_reg[0] );
695
696                                 out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[10], weights_reg[1] );
697                                 out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[11], weights_reg[1] );
698                                 out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[12], weights_reg[1] );
699                                 out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[13], weights_reg[1] );
700                                 out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[14], weights_reg[1] );
701                                 out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[15], weights_reg[1] );
702                                 out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[16], weights_reg[1] );
703
704                                 out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[11], weights_reg[2] );
705                                 out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[12], weights_reg[2] );
706                                 out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[13], weights_reg[2] );
707                                 out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[14], weights_reg[2] );
708                                 out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[15], weights_reg[2] );
709                                 out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[16], weights_reg[2] );
710                                 out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[17], weights_reg[2] );
711
712                                 /* Load weights from SLM into registers - row1, output channels 16..23  */
713                                 {
714                                                 __local uint *slm_ptrw1 = slm_ptr1 + 3*slm_read_pixel_offset + 4*8*8;
715
716                                             weights_reg[0].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
717                                             weights_reg[0].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
718                                                 slm_ptrw1                          += slm_read_pixel_offset;
719
720                                                 weights_reg[1].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
721                                             weights_reg[1].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
722                                                 slm_ptrw1                          += slm_read_pixel_offset;
723
724                                                 weights_reg[2].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
725                                             weights_reg[2].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
726                                 }
727
728                                 out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[9], weights_reg[0] );
729                                 out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[10], weights_reg[0] );
730                                 out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[11], weights_reg[0] );
731                                 out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[12], weights_reg[0] );
732                                 out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[13], weights_reg[0] );
733                                 out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[14], weights_reg[0] );
734                                 out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[15], weights_reg[0] );
735
736                                 /* load 1Hx9Wx4N inputs, Activation row2  */
737
738                                 uint slm_row_offset_3     = 2*(TILE_W + 2)*BATCH_PACK*8;
739
740                                 __attribute__((opencl_unroll_hint(9)))
741                                 for (int ic = 0; ic < 9; ic++)
742                                 {
743                          /* Load activations from SLM into registers  */
744
745                                          uint slm_offset = slm_row_offset_3 + ic * BATCH_PACK * 8 ;
746
747                                  act_reg [ ic ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ;
748                                 }
749
750                                 out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[10], weights_reg[1] );
751                                 out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[11], weights_reg[1] );
752                                 out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[12], weights_reg[1] );
753                                 out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[13], weights_reg[1] );
754                                 out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[14], weights_reg[1] );
755                                 out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[15], weights_reg[1] );
756                                 out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[16], weights_reg[1] );
757
758                                 out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[11], weights_reg[2] );
759                                 out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[12], weights_reg[2] );
760                                 out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[13], weights_reg[2] );
761                                 out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[14], weights_reg[2] );
762                                 out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[15], weights_reg[2] );
763                                 out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[16], weights_reg[2] );
764                                 out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[17], weights_reg[2] );
765
766                                 /* Load weights from SLM into registers - row1, output channels 24..31  */
767                                 {
768                                                 __local uint *slm_ptrw1 = slm_ptr1 + 3*slm_read_pixel_offset + 6*8*8;
769
770                                             weights_reg[0].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
771                                             weights_reg[0].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
772                                                 slm_ptrw1                          += slm_read_pixel_offset;
773
774                                                 weights_reg[1].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
775                                             weights_reg[1].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
776                                                 slm_ptrw1                          += slm_read_pixel_offset;
777
778                                                 weights_reg[2].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
779                                             weights_reg[2].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
780                                 }
781
782                                 out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[9], weights_reg[0] );
783                                 out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[10], weights_reg[0] );
784                                 out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[11], weights_reg[0] );
785                                 out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[12], weights_reg[0] );
786                                 out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[13], weights_reg[0] );
787                                 out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[14], weights_reg[0] );
788                                 out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[15], weights_reg[0] );
789
790                                 out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[10], weights_reg[1] );
791                                 out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[11], weights_reg[1] );
792                                 out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[12], weights_reg[1] );
793                                 out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[13], weights_reg[1] );
794                                 out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[14], weights_reg[1] );
795                                 out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[15], weights_reg[1] );
796                                 out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[16], weights_reg[1] );
797
798                                 out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[11], weights_reg[2] );
799                                 out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[12], weights_reg[2] );
800                                 out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[13], weights_reg[2] );
801                                 out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[14], weights_reg[2] );
802                                 out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[15], weights_reg[2] );
803                                 out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[16], weights_reg[2] );
804                                 out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[17], weights_reg[2] );
805
806                         /**********************************************************************************************************
807                           Third phase - multiply third row of weights  and third row of activations
808                         ***********************************************************************************************************/
809
810                                  /* Load weights from SLM into registers - row2, output channels 0..7  */
811                                 {
812                                                 __local uint *slm_ptrw2  = slm_ptr1 + 6*slm_read_pixel_offset;
813
814                                             weights_reg[0].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
815                                             weights_reg[0].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
816                                                 slm_ptrw2                          += slm_read_pixel_offset;
817
818                                                 weights_reg[1].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
819                                             weights_reg[1].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
820                                                 slm_ptrw2                            += slm_read_pixel_offset;
821
822                                                 weights_reg[2].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
823                                             weights_reg[2].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
824                                 }
825
826                                 out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[0], weights_reg[0] );
827                                 out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[1], weights_reg[0] );
828                                 out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[2], weights_reg[0] );
829                                 out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[3], weights_reg[0] );
830                                 out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[4], weights_reg[0] );
831                                 out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[5], weights_reg[0] );
832                                 out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[6], weights_reg[0] );
833
834                                 out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[1], weights_reg[1] );
835                                 out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[2], weights_reg[1] );
836                                 out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[3], weights_reg[1] );
837                                 out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[4], weights_reg[1] );
838                                 out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[5], weights_reg[1] );
839                                 out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[6], weights_reg[1] );
840                                 out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[7], weights_reg[1] );
841
842                                 out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[2], weights_reg[2] );
843                                 out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[3], weights_reg[2] );
844                                 out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[4], weights_reg[2] );
845                                 out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[5], weights_reg[2] );
846                                 out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[6], weights_reg[2] );
847                                 out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[7], weights_reg[2] );
848                                 out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[8], weights_reg[2] );
849
850                                      /* Load weights from SLM into registers - row2, output channels 8..15  */
851                                 {
852                                                 __local uint *slm_ptrw2 = slm_ptr1 + 6*slm_read_pixel_offset + 2*8*8;
853
854                                             weights_reg[0].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
855                                             weights_reg[0].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
856                                                 slm_ptrw2                          += slm_read_pixel_offset;
857
858                                                 weights_reg[1].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
859                                             weights_reg[1].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
860                                                 slm_ptrw2                          += slm_read_pixel_offset;
861
862                                                 weights_reg[2].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
863                                             weights_reg[2].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
864                                 }
865
866                                 out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[0], weights_reg[0] );
867                                 out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[1], weights_reg[0] );
868                                 out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[2], weights_reg[0] );
869                                 out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[3], weights_reg[0] );
870                                 out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[4], weights_reg[0] );
871                                 out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[5], weights_reg[0] );
872                                 out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[6], weights_reg[0] );
873
874                                 out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[1], weights_reg[1] );
875                                 out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[2], weights_reg[1] );
876                                 out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[3], weights_reg[1] );
877                                 out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[4], weights_reg[1] );
878                                 out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[5], weights_reg[1] );
879                                 out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[6], weights_reg[1] );
880                                 out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[7], weights_reg[1] );
881
882                                 out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[2], weights_reg[2] );
883                                 out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[3], weights_reg[2] );
884                                 out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[4], weights_reg[2] );
885                                 out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[5], weights_reg[2] );
886                                 out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[6], weights_reg[2] );
887                                 out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[7], weights_reg[2] );
888                                 out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[8], weights_reg[2] );
889
890                                 /* Load weights from SLM into registers - row2, output channels 16..23  */
891                                 {
892                                                 __local uint *slm_ptrw2 = slm_ptr1 + 6*slm_read_pixel_offset + 4*8*8;
893
894                                             weights_reg[0].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
895                                             weights_reg[0].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
896                                                 slm_ptrw2                          += slm_read_pixel_offset;
897
898                                                 weights_reg[1].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
899                                             weights_reg[1].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
900                                                 slm_ptrw2                          += slm_read_pixel_offset;
901
902                                                 weights_reg[2].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
903                                             weights_reg[2].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
904                                 }
905
906                                 out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[0], weights_reg[0] );
907                                 out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[1], weights_reg[0] );
908                                 out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[2], weights_reg[0] );
909                                 out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[3], weights_reg[0] );
910                                 out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[4], weights_reg[0] );
911                                 out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[5], weights_reg[0] );
912                                 out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[6], weights_reg[0] );
913
914                                 out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[1], weights_reg[1] );
915                                 out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[2], weights_reg[1] );
916                                 out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[3], weights_reg[1] );
917                                 out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[4], weights_reg[1] );
918                                 out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[5], weights_reg[1] );
919                                 out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[6], weights_reg[1] );
920                                 out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[7], weights_reg[1] );
921
922                                 out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[2], weights_reg[2] );
923                                 out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[3], weights_reg[2] );
924                                 out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[4], weights_reg[2] );
925                                 out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[5], weights_reg[2] );
926                                 out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[6], weights_reg[2] );
927                                 out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[7], weights_reg[2] );
928                                 out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[8], weights_reg[2] );
929
930                                 /* Load weights from SLM into registers - row3, output channels 24..31  */
931                                 {
932                                                 __local uint *slm_ptrw2 = slm_ptr1 + 6*slm_read_pixel_offset + 6*8*8;
933
934                                             weights_reg[0].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
935                                             weights_reg[0].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
936                                                 slm_ptrw2                          += slm_read_pixel_offset;
937
938                                                 weights_reg[1].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
939                                             weights_reg[1].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
940                                                 slm_ptrw2                          += slm_read_pixel_offset;
941
942                                                 weights_reg[2].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
943                                             weights_reg[2].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
944                                 }
945
946                                 out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[0], weights_reg[0] );
947                                 out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[1], weights_reg[0] );
948                                 out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[2], weights_reg[0] );
949                                 out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[3], weights_reg[0] );
950                                 out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[4], weights_reg[0] );
951                                 out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[5], weights_reg[0] );
952                                 out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[6], weights_reg[0] );
953
954                                 out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[1], weights_reg[1] );
955                                 out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[2], weights_reg[1] );
956                                 out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[3], weights_reg[1] );
957                                 out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[4], weights_reg[1] );
958                                 out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[5], weights_reg[1] );
959                                 out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[6], weights_reg[1] );
960                                 out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[7], weights_reg[1] );
961
962                                 out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[2], weights_reg[2] );
963                                 out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[3], weights_reg[2] );
964                                 out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[4], weights_reg[2] );
965                                 out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[5], weights_reg[2] );
966                                 out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[6], weights_reg[2] );
967                                 out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[7], weights_reg[2] );
968                                 out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[8], weights_reg[2] );
969                 }
970
971                         // To make sure all threads in WG have finished compute before next depth tile of activation and weights are loaded into SLM
972                         barrier(CLK_LOCAL_MEM_FENCE);
973         } //for kd
974
975         /****************************************************************************************************************
976                 *******************************Output Write Stage****************************************************************
977                 ****************************************************************************************************************/
978                         /*
979                    Outputs will be passed through activation function and quantized to 8 bits before writing
980                    Output will be in same format as input [K/32][N/4][P][Q][4N][32K] */
981
982                         /******************* Write output to SLM *************************************/
983
984                 /*  Quantize and pack 4x1 byte - from consectuive n-coordinates
985                         Each thread produces [1P][7Q][4N][32K]
986                 Write uint32 from each lane to SLM , the entire thread will write 32-consecutive K-coorindates
987
988                         Assume one SLM row as 32 uints ( 32 channels , four batches for each channel - 4NK )
989                         In SLM 7x7x4x32 present first then the next 32 channels
990                 */
991
992                 if( lid_z <= 6 )
993         {
994                         /* feature maps are an array of slicePacks, each H,W position within the slice pack contains 32 8bit feature maps(channels) of 8 different batches */
995                         uint row_size_bytes        = (_OW + OWPAD) * PACK * BATCH_PACK;
996
997                         /* slice_pack is a pack of 32 feature map tiles that are [OH][OW][4][32] that are stored within the full [K/32][N/4][OH][OW][4][32] output */
998                         uint slice_pack_size_bytes = row_size_bytes * (_OH + OHPAD);
999
1000                         /* Each output_depth WG writes 64 output channels */
1001
1002                         uint output_depth_index      =  output_depth*2 + threadid_mod_2;
1003                         uint batch_index                         =  batch;
1004
1005                         /* Each WG produces entire 7x7 output, hence no group_y, group_z tiling */
1006
1007             uint output_offset_x = groupy_tile * OUT_X_PITCH;
1008             uint output_offset_y = groupz_tile * OUT_Y_PITCH;
1009                         uint slice_pack_addr_bytes  = output_depth_index * slice_pack_size_bytes * ( BATCH_SIZE / BATCH_PACK ) + batch_index * slice_pack_size_bytes + lid_z * row_size_bytes;
1010                                                 
1011                         __global uchar* output_write_ptr = (__global uchar *) &outputs [ slice_pack_addr_bytes + output_offset_x + output_offset_y ];
1012
1013                 const uint feature = output_depth_index * 32 + get_sub_group_local_id();
1014
1015                 const float4 quant_f = as_float4(intel_sub_group_block_read4((__global uint*) (quantizations + feature) ));
1016                 const float4 bias_f = as_float4(intel_sub_group_block_read4((__global uint*) (biases + feature) ));
1017                 const float4 calib_f = as_float4(intel_sub_group_block_read4((__global uint*) (calibrations + feature) ));
1018
1019                 __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
1020                                 for (int col = 0; col < OUT_BLOCK_WIDTH; col++)
1021                 {
1022
1023                                         int4 outvec0 = out_07[col];
1024                                         int4 outvec1 = out_815[col];
1025                                         int4 outvec2 = out_1623[col];
1026                                         int4 outvec3 = out_2431[col];
1027
1028                                         /* Non-Linear Activation & Quantization code */
1029
1030                                         uchar8 out_write_N2K4[2];
1031
1032                     QUANTIZATION;
1033
1034                                         intel_sub_group_block_write_uc8 (  output_write_ptr  , out_write_N2K4[0] );
1035                                         output_write_ptr += 64;
1036                                         intel_sub_group_block_write_uc8 (  output_write_ptr  , out_write_N2K4[1] );
1037                                         output_write_ptr += 64;
1038
1039                                 } // out_block_width-for loop
1040                 }//lid_z loop
1041 } //end of kernel
1042
1043 #undef SCAL
1044 #undef QUANTIZATION