inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_slm_7x7_rep4.cl

   1 // Copyright (c) 2016-2017 Intel Corporation
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //      http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15 #include "include/mmad.cl"
  16
  17 #define SCALE 0.11f
  18
  19 #ifdef LIGHTWEIGHT_QUANTIZATION
  20
  21 #define QUANTIZATION \
  22     out_write_N2K4[0].s0 = convert_uchar_sat((float)outvec0.s0 * SCALE + bias_f.s0); /*K= lane_id,N=0*/ \
  23     out_write_N2K4[0].s1 = convert_uchar_sat((float)outvec1.s0 * SCALE + bias_f.s1); /*K= lane_id + 8,N=0*/\
  24     out_write_N2K4[0].s2 = convert_uchar_sat((float)outvec2.s0 * SCALE + bias_f.s2); /*K= lane_id + 16,N=0*/\
  25     out_write_N2K4[0].s3 = convert_uchar_sat((float)outvec3.s0 * SCALE + bias_f.s3); /*K= lane_id + 24,N=0*/\
  26     \
  27     out_write_N2K4[0].s4 = convert_uchar_sat((float)outvec0.s1 * SCALE + bias_f.s0); /*K= lane_id,N=1*/\
  28     out_write_N2K4[0].s5 = convert_uchar_sat((float)outvec1.s1 * SCALE + bias_f.s1); /*K= lane_id + 8,N=1*/\
  29     out_write_N2K4[0].s6 = convert_uchar_sat((float)outvec2.s1 * SCALE + bias_f.s2); /*K= lane_id + 16,N=1*/\
  30     out_write_N2K4[0].s7 = convert_uchar_sat((float)outvec3.s1 * SCALE + bias_f.s3); /*K= lane_id + 24,N=1*/\
  31     \
  32     out_write_N2K4[1].s0 = convert_uchar_sat((float)outvec0.s2 * SCALE + bias_f.s0); /*K= lane_id,N=2*/\
  33     out_write_N2K4[1].s1 = convert_uchar_sat((float)outvec1.s2 * SCALE + bias_f.s1); /*K= lane_id + 8,N=2*/\
  34     out_write_N2K4[1].s2 = convert_uchar_sat((float)outvec2.s2 * SCALE + bias_f.s2); /*K= lane_id + 16,N=2*/\
  35     out_write_N2K4[1].s3 = convert_uchar_sat((float)outvec3.s2 * SCALE + bias_f.s3); /*K= lane_id + 24,N=2*/\
  36     \
  37     out_write_N2K4[1].s4 = convert_uchar_sat((float)outvec0.s3 * SCALE + bias_f.s0); /*K= lane_id,N=3*/\
  38     out_write_N2K4[1].s5 = convert_uchar_sat((float)outvec1.s3 * SCALE + bias_f.s1); /*K= lane_id + 8,N=3*/\
  39     out_write_N2K4[1].s6 = convert_uchar_sat((float)outvec2.s3 * SCALE + bias_f.s2); /*K= lane_id + 16,N=3*/\
  40     out_write_N2K4[1].s7 = convert_uchar_sat((float)outvec3.s3 * SCALE + bias_f.s3); /*K= lane_id + 24,N=3*/
  41
  42 #elif NO_QUANTIZATION
  43
  44 #define QUANTIZATION \
  45     out_write_N2K4[0].s0 = convert_uchar_sat(outvec0.s0); /*K= lane_id,N=0*/ \
  46     out_write_N2K4[0].s1 = convert_uchar_sat(outvec1.s0); /*K= lane_id + 8,N=0*/\
  47     out_write_N2K4[0].s2 = convert_uchar_sat(outvec2.s0); /*K= lane_id + 16,N=0*/\
  48     out_write_N2K4[0].s3 = convert_uchar_sat(outvec3.s0); /*K= lane_id + 24,N=0*/\
  49     \
  50     out_write_N2K4[0].s4 = convert_uchar_sat(outvec0.s1); /*K= lane_id,N=1*/\
  51     out_write_N2K4[0].s5 = convert_uchar_sat(outvec1.s1); /*K= lane_id + 8,N=1*/\
  52     out_write_N2K4[0].s6 = convert_uchar_sat(outvec2.s1); /*K= lane_id + 16,N=1*/\
  53     out_write_N2K4[0].s7 = convert_uchar_sat(outvec3.s1); /*K= lane_id + 24,N=1*/\
  54     \
  55     out_write_N2K4[1].s0 = convert_uchar_sat(outvec0.s2); /*K= lane_id,N=2*/\
  56     out_write_N2K4[1].s1 = convert_uchar_sat(outvec1.s2); /*K= lane_id + 8,N=2*/\
  57     out_write_N2K4[1].s2 = convert_uchar_sat(outvec2.s2); /*K= lane_id + 16,N=2*/\
  58     out_write_N2K4[1].s3 = convert_uchar_sat(outvec3.s2); /*K= lane_id + 24,N=2*/\
  59     \
  60     out_write_N2K4[1].s4 = convert_uchar_sat(outvec0.s3); /*K= lane_id,N=3*/\
  61     out_write_N2K4[1].s5 = convert_uchar_sat(outvec1.s3); /*K= lane_id + 8,N=3*/\
  62     out_write_N2K4[1].s6 = convert_uchar_sat(outvec2.s3); /*K= lane_id + 16,N=3*/\
  63     out_write_N2K4[1].s7 = convert_uchar_sat(outvec3.s3); /*K= lane_id + 24,N=3*/
  64
  65 #else
  66
  67 #define QUANTIZATION \
  68     out_write_N2K4[0].s0 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec0.s0) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N)); /*K= lane_id,N=0*/ \
  69     out_write_N2K4[0].s1 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec1.s0) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N)); /*K= lane_id + 8,N=0*/\
  70     out_write_N2K4[0].s2 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec2.s0) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N)); /*K= lane_id + 16,N=0*/\
  71     out_write_N2K4[0].s3 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec3.s0) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N)); /*K= lane_id + 24,N=0*/\
  72     \
  73     out_write_N2K4[0].s4 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec0.s1) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N)); /*K= lane_id,N=1*/\
  74     out_write_N2K4[0].s5 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec1.s1) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N)); /*K= lane_id + 8,N=1*/\
  75     out_write_N2K4[0].s6 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec2.s1) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N)); /*K= lane_id + 16,N=1*/\
  76     out_write_N2K4[0].s7 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec3.s1) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N)); /*K= lane_id + 24,N=1*/\
  77     \
  78     out_write_N2K4[1].s0 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec0.s2) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N)); /*K= lane_id,N=2*/\
  79     out_write_N2K4[1].s1 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec1.s2) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N)); /*K= lane_id + 8,N=2*/\
  80     out_write_N2K4[1].s2 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec2.s2) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N)); /*K= lane_id + 16,N=2*/\
  81     out_write_N2K4[1].s3 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec3.s2) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N)); /*K= lane_id + 24,N=2*/\
  82     \
  83     out_write_N2K4[1].s4 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec0.s3) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N)); /*K= lane_id,N=3*/\
  84     out_write_N2K4[1].s5 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec1.s3) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N)); /*K= lane_id + 8,N=3*/\
  85     out_write_N2K4[1].s6 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec2.s3) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N)); /*K= lane_id + 16,N=3*/\
  86     out_write_N2K4[1].s7 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec3.s3) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N)); /*K= lane_id + 24,N=3*/
  87
  88 #endif
  89
  90 // mapping to clDNN
  91 #define _MMAD_4x8(C, A, B) MMAD_4x8(A, B, C)
  92 #define _OD OUTPUT_FEATURE_NUM
  93 #define _OW OUTPUT_SIZE_X
  94 #define _OH OUTPUT_SIZE_Y
  95 #define OWPAD (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X)
  96 #define OHPAD (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y)
  97 #define _IH INPUT0_SIZE_Y
  98 #define _IW INPUT0_SIZE_X
  99 #define _ID INPUT0_FEATURE_NUM
 100 #define K_HEIGHT FILTER_SIZE_Y
 101 #define K_WIDTH FILTER_SIZE_X
 102 #define BATCH_SIZE OUTPUT_BATCH_NUM
 103
 104 #define IHPAD (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y)
 105 #define IWPAD (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_PAD_AFTER_SIZE_X)
 106 #define K_STRIDE STRIDE_SIZE_X
 107 // end of mapping
 108
 109 // for now kernel stride is square
 110 #define K_WSTRIDE K_STRIDE
 111 #define K_HSTRIDE K_STRIDE
 112
 113 #define PACK 32
 114 #define BATCH_PACK 4
 115
 116 __attribute__((intel_reqd_sub_group_size(8)))
 117 KERNEL(convolution_mmad_slm_2x14_rep4)(
 118 __global int8 *inputs,
 119 __global uchar* outputs,
 120 __global int8* weights,
 121 #if BIAS_TERM
 122     __global BIAS_TYPE* biases,
 123 #endif
 124 #if QUANTIZATION_TERM
 125     const __global float* quantizations,
 126 #endif
 127 #if CALIBRATION_TERM
 128     const __global float* calibrations,
 129 #endif
 130     uint split_idx
 131 )
 132 {
 133         const uint TILE_H = OUT_BLOCK_HEIGHT*LOCAL_SIZE_Z;
 134         const uint TILE_W = OUT_BLOCK_WIDTH*LOCAL_SIZE_Y;
 135
 136         ushort fmg     = get_group_id(0);   // Output Depth
 137         ushort group_y = get_group_id(1);   // Output Width
 138         ushort group_z = get_group_id(2);   // Output Height
 139
 140         /* 16,1,8 WG , SIMD8 - 16 HW threads in a WG
 141         threads 0-1 : ( lid_x:0-15,lid_y:0,lid_z:0)
 142         threads 2-3 : ( lid_x:0-15,lid_y:0,lid_z:1)
 143         ..
 144         threads 12-13: ( lid_x:0-15, lid_y:0,lid_z:6)
 145         threads 14-15: ( lid_x:0-15, lid_y:0,lid_z:7)
 146         */
 147
 148         /* Thread, local IDs */
 149         ushort thread_id                = get_sub_group_id();
 150         ushort threadid_mod_2   = thread_id % 2;
 151         ushort threadid_mod_8   = thread_id % 8;
 152
 153         ushort lid_x    = get_local_id(0);
 154         ushort lid_z    = get_local_id(2);
 155
 156         uchar  lane_id  = get_sub_group_local_id();
 157
 158         /* 32-bit signed accumulator , 112 output registers for 1Px7Qx4Nx32K output tile size
 159            Will be converted to 8-bits before final write */
 160
 161         int4  out_07 [ OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH ]   = {0}; // For output channels 0-7
 162         int4  out_815[ OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH ]   = {0}; // For output channels 8-15
 163         int4  out_1623[ OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH ]  = {0}; // For output channels 16-23
 164         int4  out_2431[ OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH ]  = {0}; // For output channels 24-31
 165
 166         /* Account for batching */
 167
 168         ushort batch    = ( fmg*LOCAL_SIZE_X*4 ) /_OD; // Each thread processing 32 output_channels and each fmg processing 64 output channels , LOCAL_SIZE_X is only 16
 169
 170         // Size calculated for int8 elements
 171         uint input_size = (_IH + IHPAD) * (_IW + IWPAD) * BATCH_PACK ;
 172
 173         uint in_addr_offset = batch*input_size;
 174
 175         /* Goto activation tile for work group, offset is w.r.t int8 array */
 176
 177         uint groupy_tile = TILE_W*group_y;
 178         uint groupz_tile = TILE_H*group_z;
 179
 180     in_addr_offset += (groupz_tile * K_STRIDE) * (_IW + IWPAD) * BATCH_PACK + (groupy_tile * K_STRIDE) * BATCH_PACK;
 181
 182                 /* SLM space for Activation, Weights
 183                ( 16,1,8 ) Workgroup - 7 tiles along Y direction and 64 different output channels
 184                     2 threads used to load global memory
 185                 Activation - 9Hx9Wx4Nx32C Weights -3Rx3Sx64Kx32C        */
 186
 187         __local int8 act_slm      [  9*9*4 ];
 188         __local int8 weight_slm   [  9*64  ];
 189
 190    /* 9Hx9Wx4Nx32C activation tile written into SLM.  Distribute among 14 threads in Workgroup
 191            threads 0-1 write 9x4x32 of  H=0, W=0...8
 192            threads 2-3 write 9x4x32 of H=1, W=0...8
 193            threads 4-5 write 9x4x32 of H=2, W=0...8
 194            threads 6-7  write 9x4x32 of H=3, W=0...8
 195            threads 8-9 write 9x4x32 of H=4, W=0...8
 196            threads 10-11 write 9x4x32 of H=5,W=0...8
 197            threads 12-13 write 9x4x32 of H=6,W=0...8
 198            threads 14 write 9x4x32 of H=7,W=0...8
 199            threads 15 write 9x4x32 of H=8,W=0...8 */
 200
 201         /* Goto activation tile for thread in group */
 202
 203         uint row_offset   =  thread_id / 2;
 204
 205         if ( thread_id >= 14 )
 206     {
 207         row_offset = 7;
 208         }
 209
 210         // In addr offset for the particular thread
 211         in_addr_offset    += row_offset * K_STRIDE * (_IW + IWPAD ) * BATCH_PACK ;
 212
 213    /* Activation SLM indices */
 214     uint act_slm_write =  row_offset * ( TILE_W + 2) * BATCH_PACK;
 215         uint act_slm_read  =  OUT_BLOCK_HEIGHT * lid_z * ( TILE_W + 2) * BATCH_PACK ;
 216
 217         /* 9RSx64Kx32C Weight Block in SLM
 218            thread0 handles ( reads from global ) w(0,0),w(0,1),w(0,2) of K=0,1 ( k=0..15 )
 219            thread1 handles w(0,0),w(0,1),w(0,2) of K=2,3 ( k=16..31)
 220            thread2 handles w(1,0),w(1,1) of K=0,1 ( k=0..15)
 221            thread3 handles w(1,0),w(1,1) of K=2,3 ( k=16..31)
 222            thread4 handles w(1,2),w(2,0) of K=0,1 ( k=0..15)
 223            thread5 handles w(1,2),w(2,0) of K=2,3 ( k=16..31)
 224            thread6 handles w(2,1),w(2,2) of K=0,1 ( k=0..15)
 225            thread7 handles w(2,1),w(2,2) of K=2,3 ( k=16..31)
 226
 227            Similarly threads8-15 handles for K=4,5,6,7
 228
 229            Weight Layout in SLM
 230
 231            w(R=0,S=0,k=0..7,C=0..15),w(R=0,S=0,k=32..39,C=0..15)
 232            w(R=0,S=0,k=0..7,C=16..31),w(R=0,S=0,k=32..39,C=16..31)
 233
 234            Above interleaving present to avoid SLM Bank conflicts when fused threads read from SLM
 235            Thread0 will read k=0..31, thread1 will read k=32..63
 236
 237            First all output channels are present in SLM, then next weight pixel is present in SLM */
 238
 239          #define NUM_FILTERS (K_HEIGHT * K_WIDTH)
 240
 241          uint output_depth    = fmg % ( _OD / ( LOCAL_SIZE_X * 4 ) ); //LOCAL_SIZE_X=16, 64 output channels used
 242
 243          uint weight_size_CRS =  ( _ID / PACK ) * NUM_FILTERS * 8; //8 output channels packed inside
 244
 245          // Global weight addr for workgroup
 246          uint weight_global_addr_offset =  output_depth * 8 * weight_size_CRS ; //64 output channels per workgroup
 247
 248          /* Global weight address for thread */
 249
 250          // Goto appropriate output channel in weights
 251          uint weight_global_channel_offset = threadid_mod_2 * 2 * weight_size_CRS ;
 252
 253         uint slm_channel_offset     = threadid_mod_2;
 254         uint bc_fused_thread_offset = 0;
 255
 256          if ( thread_id >= 8 )
 257     {
 258                 bc_fused_thread_offset =  1;
 259
 260                 weight_global_channel_offset =  4 * weight_size_CRS + slm_channel_offset * weight_size_CRS * 2 ;
 261     }
 262
 263          // Goto appropriate pixel in weights
 264
 265          uint weight_global_pixel_offset = 0;
 266          uint slm_pixel_offset = 0;
 267
 268     if ( threadid_mod_8 >=2  )
 269     {
 270          /* First three pixels handled by threads 0-1, then 2 pixels handled by two threads */
 271
 272                 weight_global_pixel_offset = 3*8 +  ( ( (threadid_mod_8/2) - 1 )*2*8 );
 273                 slm_pixel_offset                   = 3*64 + ( ( (threadid_mod_8/2) - 1 )*2*64 );
 274     }
 275
 276     weight_global_addr_offset += weight_global_channel_offset + weight_global_pixel_offset;
 277
 278          /* Weight slm write index */
 279
 280          uint slm_write_weight = slm_pixel_offset + slm_channel_offset * 32 + bc_fused_thread_offset * 4;
 281
 282          /* Weight slm read index */
 283
 284          /* Thread 0  reads output channels 0-15, thread 1 handles output channels 16-31, data present in interleaved
 285             manner in SLM
 286                 Data layout in SLM
 287
 288                 w(0,0) C=0..7, K = 0..7 | w(0,0) C=0..7, K = 32..39
 289                 w(0,0) C=8..15,K=0..7   | w(0,0) C=8..15,K = 32..39
 290                 w(0,0) C=0..7, K=8..15  | w(0,0) C=0..7, K = 40..47
 291                 w(0,0) C=8..15,K=8..15  | w(0,0) C=8..15,K=  40..47
 292
 293                 */
 294     uint wt_slm_rd_offset = threadid_mod_2*4;
 295
 296         int kd;
 297
 298         __attribute__((opencl_unroll_hint(1)))
 299         for(kd = 0; kd <  ( _ID / PACK ) ; kd++)
 300         {
 301                 {
 302                         /* Load Activation from global to SLM */
 303
 304                         int in_addr = kd * (_IH + IHPAD) * (_IW + IWPAD) * BATCH_SIZE + in_addr_offset;
 305
 306                         __global uint *activation_tile = (__global uint*)&inputs[ in_addr ];
 307
 308                         __local uint *act_slm_ptr   = (__local uint *) &act_slm [ act_slm_write  ];
 309
 310                         /* The odd thread in fused pair will start from next 4x8 block */
 311
 312                         activation_tile += threadid_mod_2*4*8;
 313                         act_slm_ptr     += threadid_mod_2*4*8;
 314
 315                         int4 act_col_0 =  as_int4( intel_sub_group_block_read4(activation_tile) );//col 0
 316                         int4 act_col_1 =  as_int4( intel_sub_group_block_read4(activation_tile + 8*8) );//col 2
 317                         int4 act_col_2 =  as_int4( intel_sub_group_block_read4(activation_tile + 2*8*8) );//col 4
 318                         int4 act_col_3 =  as_int4( intel_sub_group_block_read4(activation_tile + 3*8*8) );//col 6
 319
 320                         SLM_BLOCK_WRITE_4 ( act_slm_ptr , as_uint4 ( act_col_0 ) );
 321                         SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 8*8 ) , as_uint4 ( act_col_1 ) );
 322                         SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 2*8*8 ) , as_uint4 ( act_col_2 ) );
 323                         SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 3*8*8 ) , as_uint4 ( act_col_3 ) );
 324
 325                         if ( threadid_mod_2  == 0 )
 326             {
 327                                 int4 act_col_4 =  as_int4( intel_sub_group_block_read4(activation_tile + 4*8*8) );
 328
 329                                 SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 4*8*8 ) , as_uint4 ( act_col_4 ) );
 330                         }
 331
 332                         if ( thread_id >=14)
 333             {
 334                                 activation_tile  = activation_tile + 1 * (_IW + IWPAD ) * BATCH_PACK * 8;
 335                                 act_slm_ptr      = act_slm_ptr + (TILE_W + 2)  * BATCH_PACK *8;
 336
 337                                 int4 act_col_9 =  as_int4( intel_sub_group_block_read4(activation_tile) );
 338                                 int4 act_col_10 =  as_int4( intel_sub_group_block_read4(activation_tile + 8*8) );
 339                                 int4 act_col_11 =  as_int4( intel_sub_group_block_read4(activation_tile + 2*8*8) );
 340                                 int4 act_col_12 =  as_int4( intel_sub_group_block_read4(activation_tile + 3*8*8) );
 341
 342                                 SLM_BLOCK_WRITE_4 ( act_slm_ptr  , as_uint4 ( act_col_9 ) );
 343                                 SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 8*8 )   , as_uint4 ( act_col_10 ) );
 344                                 SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 2*8*8 ) , as_uint4 ( act_col_11 ) );
 345                                 SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 3*8*8 ) , as_uint4 ( act_col_12 ) );
 346
 347                                 if ( threadid_mod_2  == 0 )
 348                 {
 349                                         int4 act_col_13 =  as_int4( intel_sub_group_block_read4(activation_tile + 4*8*8) );
 350
 351                                         SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 4*8*8 ) , as_uint4 ( act_col_13 ) );
 352                                 }
 353                         }
 354
 355                 /* load weights from global to weight_slm */
 356
 357                         int weight_addr = kd * NUM_FILTERS * 8 + weight_global_addr_offset;
 358
 359                         __global uint *weight_tile   = (__global uint*)&weights    [ weight_addr ];
 360                         __local  uint *wt_slm_ptr    = (__local uint *)&weight_slm [ slm_write_weight  ];
 361
 362                         __global uint *weight_tile_2   = weight_tile;
 363                         __local uint *wt_slm_ptr_2     = wt_slm_ptr;
 364
 365                         int4 w0 = as_int4 ( intel_sub_group_block_read4( weight_tile ) );       // Pixel1 K=0..7 C=0..15
 366                         int4 w1 = as_int4 ( intel_sub_group_block_read4( weight_tile + 4*8 ) ); // Pixel1 K=0..7 C=16..31
 367                         int4 w2 = as_int4 ( intel_sub_group_block_read4( weight_tile + 8*8 ) ); // Pixel2 K=0..7 C=0..15
 368                         int4 w3 = as_int4 ( intel_sub_group_block_read4( weight_tile + 12*8 ) );// Pixel2 K=0..7 C=16..31
 369
 370                         // Goto next output channel
 371                         weight_tile += weight_size_CRS*8;
 372
 373                         int4 w4 = as_int4 ( intel_sub_group_block_read4( weight_tile ) );       // Pixel1 K=8..15 C=0..15
 374                         int4 w5 = as_int4 ( intel_sub_group_block_read4( weight_tile + 4*8 ) ); // Pixel1 K=8..15 C=16..31
 375                         int4 w6 = as_int4 ( intel_sub_group_block_read4( weight_tile + 8*8 ) ); // Pixel2 K=8..15 C=0..15
 376                         int4 w7 = as_int4 ( intel_sub_group_block_read4( weight_tile + 12*8 ) );// Pixel2 K=8..15 C=16..31
 377
 378                         SLM_BLOCK_WRITE_4 ( wt_slm_ptr, as_uint4 ( w0 ) );
 379                         SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 8*8 ) , as_uint4 ( w1 ) );
 380                         SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 64*8 ), as_uint4 ( w2 ) );
 381                         SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 64*8 + 8*8 ), as_uint4 ( w3 ) );
 382
 383                         wt_slm_ptr  += 16*8;
 384
 385                         SLM_BLOCK_WRITE_4 ( wt_slm_ptr , as_uint4 ( w4 ) );
 386                         SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 8*8 )   , as_uint4 ( w5 ) );
 387                         SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 64*8 ) , as_uint4 ( w6 ) );
 388                         SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 64*8 + 8*8 ) , as_uint4 ( w7 ) );
 389
 390                    if( threadid_mod_8 < 2 )
 391            {
 392                                 // Goto next pixel
 393                                 weight_tile_2 += 16*8;
 394                                 wt_slm_ptr_2  += 2*64*8;
 395
 396                                 int4 w0 = as_int4 ( intel_sub_group_block_read4( weight_tile_2 ) );     // Pixel1 K=0..7 C=0..15
 397                                 int4 w1 = as_int4 ( intel_sub_group_block_read4( weight_tile_2 + 4*8 ) );       // Pixel1 K=0..7 C=16..31
 398
 399                                 // Goto next output channel
 400                                 weight_tile_2 += weight_size_CRS*8;
 401
 402                                 int4 w4 = as_int4 ( intel_sub_group_block_read4( weight_tile_2 ) );     // Pixel1 K=8..15 C=0..15
 403                                 int4 w5 = as_int4 ( intel_sub_group_block_read4( weight_tile_2 + 4*8 ) );       // Pixel1 K=8..15 C=16..31
 404
 405                                 SLM_BLOCK_WRITE_4 ( wt_slm_ptr_2, as_uint4 ( w0 ) );
 406                                 SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr_2 + 8*8 ) , as_uint4 ( w1 ) );
 407
 408                                 wt_slm_ptr_2  += 16*8;
 409
 410                                 SLM_BLOCK_WRITE_4 ( wt_slm_ptr_2 , as_uint4 ( w4 ) );
 411                                 SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr_2 + 8*8 )   , as_uint4 ( w5 ) );
 412                         }
 413         }
 414
 415                 // Synchronize SLM writes across workgroup
 416                  barrier(CLK_LOCAL_MEM_FENCE);
 417
 418                 if ( lid_z <= 6 )
 419         {
 420                         uint wt_slm_rd = wt_slm_rd_offset;
 421
 422                         __local uint *slm_ptr0     = (__local uint *) &act_slm[ act_slm_read ];
 423                         __local uint *slm_ptr1     = (__local uint *) &weight_slm[ wt_slm_rd ];
 424
 425                         /* balancing load of weights, activations   */
 426                         int8 weights_reg[3]; //24 registers
 427                         int4 act_reg[18];    //72 registers
 428                         uint slm_read_pixel_offset = 64*8;
 429
 430                         /**********************************************************************************************************
 431                           First phase - multiply first row of weights  and 1st row of activations
 432                         ***********************************************************************************************************/
 433
 434                          /* Load weights from SLM into registers - row0, output channels 0..7  */
 435
 436                                 {
 437                                                 __local uint *slm_ptrw0  = slm_ptr1;
 438
 439                                             weights_reg[0].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
 440                                             weights_reg[0].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
 441                                                 slm_ptrw0                        += slm_read_pixel_offset;
 442
 443                                                 weights_reg[1].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
 444                                             weights_reg[1].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
 445                                                 slm_ptrw0                        += slm_read_pixel_offset;
 446
 447                                                 weights_reg[2].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
 448                                             weights_reg[2].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
 449                                 }
 450
 451                         /* load 1Hx9Wx4N inputs, Activation row0   */
 452
 453                                 __attribute__((opencl_unroll_hint(9)))
 454                                 for (int ic = 0; ic < 9; ic++)
 455                                 {
 456                          /* Load activations from SLM into registers  */
 457
 458                                          uint slm_offset = ic * BATCH_PACK * 8 ;
 459
 460                                  act_reg [ ic ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ;
 461                                 }
 462
 463                         /* Convolve */
 464
 465                            /* order the mmad instructions to minimize dependency on src0,dst - also try to maximise reuse of weights-reg*/
 466
 467                                 /*  Output channels 0-7 */
 468
 469                                 out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[0], weights_reg[0] );
 470                                 out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[1], weights_reg[0] );
 471                                 out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[2], weights_reg[0] );
 472                                 out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[3], weights_reg[0] );
 473                                 out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[4], weights_reg[0] );
 474                                 out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[5], weights_reg[0] );
 475                                 out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[6], weights_reg[0] );
 476
 477                                 out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[1], weights_reg[1] );
 478                                 out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[2], weights_reg[1] );
 479                                 out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[3], weights_reg[1] );
 480                                 out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[4], weights_reg[1] );
 481                                 out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[5], weights_reg[1] );
 482                                 out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[6], weights_reg[1] );
 483                                 out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[7], weights_reg[1] );
 484
 485                                 out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[2], weights_reg[2] );
 486                                 out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[3], weights_reg[2] );
 487                                 out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[4], weights_reg[2] );
 488                                 out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[5], weights_reg[2] );
 489                                 out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[6], weights_reg[2] );
 490                                 out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[7], weights_reg[2] );
 491                                 out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[8], weights_reg[2] );
 492
 493                      /* Load weights from SLM into registers - row0, output channels 8..15  */
 494
 495                                 {
 496                                                 __local uint *slm_ptrw0 = slm_ptr1 + 2*8*8;
 497
 498                                             weights_reg[0].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
 499                                             weights_reg[0].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
 500                                                 slm_ptrw0                        += slm_read_pixel_offset;
 501
 502                                                 weights_reg[1].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
 503                                             weights_reg[1].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
 504                                                 slm_ptrw0                        += slm_read_pixel_offset;
 505
 506                                                 weights_reg[2].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
 507                                             weights_reg[2].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
 508                                 }
 509
 510                                 out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[0], weights_reg[0] );
 511                                 out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[1], weights_reg[0] );
 512                                 out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[2], weights_reg[0] );
 513                                 out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[3], weights_reg[0] );
 514                                 out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[4], weights_reg[0] );
 515                                 out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[5], weights_reg[0] );
 516                                 out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[6], weights_reg[0] );
 517
 518                                 out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[1], weights_reg[1] );
 519                                 out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[2], weights_reg[1] );
 520                                 out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[3], weights_reg[1] );
 521                                 out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[4], weights_reg[1] );
 522                                 out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[5], weights_reg[1] );
 523                                 out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[6], weights_reg[1] );
 524                                 out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[7], weights_reg[1] );
 525
 526                                 out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[2], weights_reg[2] );
 527                                 out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[3], weights_reg[2] );
 528                                 out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[4], weights_reg[2] );
 529                                 out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[5], weights_reg[2] );
 530                                 out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[6], weights_reg[2] );
 531                                 out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[7], weights_reg[2] );
 532                                 out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[8], weights_reg[2] );
 533
 534                                 /* Load weights from SLM into registers - row0, output channels 16..23  */
 535                                 {
 536                                                 __local uint *slm_ptrw0 = slm_ptr1 + 4*8*8;
 537
 538                                             weights_reg[0].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
 539                                             weights_reg[0].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
 540                                                 slm_ptrw0                        += slm_read_pixel_offset;
 541
 542                                                 weights_reg[1].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
 543                                             weights_reg[1].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
 544                                                 slm_ptrw0                        += slm_read_pixel_offset;
 545
 546                                                 weights_reg[2].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
 547                                             weights_reg[2].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
 548                                 }
 549
 550                                 out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[0], weights_reg[0] );
 551                                 out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[1], weights_reg[0] );
 552                                 out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[2], weights_reg[0] );
 553                                 out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[3], weights_reg[0] );
 554                                 out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[4], weights_reg[0] );
 555                                 out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[5], weights_reg[0] );
 556                                 out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[6], weights_reg[0] );
 557
 558                                 /* load 1Hx9Wx4N inputs, Activation row1   */
 559
 560                                 uint slm_row_offset_2     = 1*(TILE_W + 2)*BATCH_PACK*8;
 561
 562                                 __attribute__((opencl_unroll_hint(9)))
 563                                 for (int ic = 0; ic < 9; ic++)
 564                                 {
 565                          /* Load activations from SLM into registers  */
 566
 567                                          uint slm_offset = slm_row_offset_2 + ic * BATCH_PACK * 8 ;
 568
 569                                  act_reg [ ic + 9 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ;
 570                                 }
 571
 572                                 out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[1], weights_reg[1] );
 573                                 out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[2], weights_reg[1] );
 574                                 out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[3], weights_reg[1] );
 575                                 out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[4], weights_reg[1] );
 576                                 out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[5], weights_reg[1] );
 577                                 out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[6], weights_reg[1] );
 578                                 out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[7], weights_reg[1] );
 579
 580                                 out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[2], weights_reg[2] );
 581                                 out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[3], weights_reg[2] );
 582                                 out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[4], weights_reg[2] );
 583                                 out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[5], weights_reg[2] );
 584                                 out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[6], weights_reg[2] );
 585                                 out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[7], weights_reg[2] );
 586                                 out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[8], weights_reg[2] );
 587
 588                                 /* Load weights from SLM into registers - row0, output channels 24..31  */
 589                                 {
 590                                                 __local uint *slm_ptrw0 = slm_ptr1 + 6*8*8;
 591
 592                                             weights_reg[0].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
 593                                             weights_reg[0].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
 594                                                 slm_ptrw0                        += slm_read_pixel_offset;
 595
 596                                                 weights_reg[1].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
 597                                             weights_reg[1].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
 598                                                 slm_ptrw0                        += slm_read_pixel_offset;
 599
 600                                                 weights_reg[2].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) );
 601                                             weights_reg[2].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) );
 602                                 }
 603
 604                                 out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[0], weights_reg[0] );
 605                                 out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[1], weights_reg[0] );
 606                                 out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[2], weights_reg[0] );
 607                                 out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[3], weights_reg[0] );
 608                                 out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[4], weights_reg[0] );
 609                                 out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[5], weights_reg[0] );
 610                                 out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[6], weights_reg[0] );
 611
 612                                 out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[1], weights_reg[1] );
 613                                 out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[2], weights_reg[1] );
 614                                 out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[3], weights_reg[1] );
 615                                 out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[4], weights_reg[1] );
 616                                 out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[5], weights_reg[1] );
 617                                 out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[6], weights_reg[1] );
 618                                 out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[7], weights_reg[1] );
 619
 620                                 out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[2], weights_reg[2] );
 621                                 out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[3], weights_reg[2] );
 622                                 out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[4], weights_reg[2] );
 623                                 out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[5], weights_reg[2] );
 624                                 out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[6], weights_reg[2] );
 625                                 out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[7], weights_reg[2] );
 626                                 out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[8], weights_reg[2] );
 627
 628                         /**********************************************************************************************************
 629                           Second phase - multiply second row of weights  and second row of activations
 630                         ***********************************************************************************************************/
 631
 632                          /* Load weights from SLM into registers - row1, output channels 0..7  */
 633                                 {
 634                                                 __local uint *slm_ptrw1  = slm_ptr1 + 3*slm_read_pixel_offset;
 635
 636                                             weights_reg[0].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
 637                                             weights_reg[0].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
 638                                                 slm_ptrw1                        += slm_read_pixel_offset;
 639
 640                                                 weights_reg[1].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
 641                                             weights_reg[1].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
 642                                                 slm_ptrw1                            += slm_read_pixel_offset;
 643
 644                                                 weights_reg[2].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
 645                                             weights_reg[2].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
 646                                 }
 647
 648                                 out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[9], weights_reg[0] );
 649                                 out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[10], weights_reg[0] );
 650                                 out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[11], weights_reg[0] );
 651                                 out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[12], weights_reg[0] );
 652                                 out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[13], weights_reg[0] );
 653                                 out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[14], weights_reg[0] );
 654                                 out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[15], weights_reg[0] );
 655
 656                                 out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[10], weights_reg[1] );
 657                                 out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[11], weights_reg[1] );
 658                                 out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[12], weights_reg[1] );
 659                                 out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[13], weights_reg[1] );
 660                                 out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[14], weights_reg[1] );
 661                                 out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[15], weights_reg[1] );
 662                                 out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[16], weights_reg[1] );
 663
 664                                 out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[11], weights_reg[2] );
 665                                 out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[12], weights_reg[2] );
 666                                 out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[13], weights_reg[2] );
 667                                 out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[14], weights_reg[2] );
 668                                 out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[15], weights_reg[2] );
 669                                 out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[16], weights_reg[2] );
 670                                 out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[17], weights_reg[2] );
 671
 672                                     /* Load weights from SLM into registers - row1, output channels 8..15  */
 673                                 {
 674                                                 __local uint *slm_ptrw1 = slm_ptr1 + 3*slm_read_pixel_offset + 2*8*8;
 675
 676                                             weights_reg[0].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
 677                                             weights_reg[0].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
 678                                                 slm_ptrw1                          += slm_read_pixel_offset;
 679
 680                                                 weights_reg[1].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
 681                                             weights_reg[1].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
 682                                                 slm_ptrw1                          += slm_read_pixel_offset;
 683
 684                                                 weights_reg[2].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
 685                                             weights_reg[2].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
 686                                 }
 687
 688                                 out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[9], weights_reg[0] );
 689                                 out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[10], weights_reg[0] );
 690                                 out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[11], weights_reg[0] );
 691                                 out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[12], weights_reg[0] );
 692                                 out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[13], weights_reg[0] );
 693                                 out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[14], weights_reg[0] );
 694                                 out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[15], weights_reg[0] );
 695
 696                                 out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[10], weights_reg[1] );
 697                                 out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[11], weights_reg[1] );
 698                                 out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[12], weights_reg[1] );
 699                                 out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[13], weights_reg[1] );
 700                                 out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[14], weights_reg[1] );
 701                                 out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[15], weights_reg[1] );
 702                                 out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[16], weights_reg[1] );
 703
 704                                 out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[11], weights_reg[2] );
 705                                 out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[12], weights_reg[2] );
 706                                 out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[13], weights_reg[2] );
 707                                 out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[14], weights_reg[2] );
 708                                 out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[15], weights_reg[2] );
 709                                 out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[16], weights_reg[2] );
 710                                 out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[17], weights_reg[2] );
 711
 712                                 /* Load weights from SLM into registers - row1, output channels 16..23  */
 713                                 {
 714                                                 __local uint *slm_ptrw1 = slm_ptr1 + 3*slm_read_pixel_offset + 4*8*8;
 715
 716                                             weights_reg[0].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
 717                                             weights_reg[0].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
 718                                                 slm_ptrw1                          += slm_read_pixel_offset;
 719
 720                                                 weights_reg[1].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
 721                                             weights_reg[1].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
 722                                                 slm_ptrw1                          += slm_read_pixel_offset;
 723
 724                                                 weights_reg[2].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
 725                                             weights_reg[2].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
 726                                 }
 727
 728                                 out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[9], weights_reg[0] );
 729                                 out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[10], weights_reg[0] );
 730                                 out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[11], weights_reg[0] );
 731                                 out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[12], weights_reg[0] );
 732                                 out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[13], weights_reg[0] );
 733                                 out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[14], weights_reg[0] );
 734                                 out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[15], weights_reg[0] );
 735
 736                                 /* load 1Hx9Wx4N inputs, Activation row2  */
 737
 738                                 uint slm_row_offset_3     = 2*(TILE_W + 2)*BATCH_PACK*8;
 739
 740                                 __attribute__((opencl_unroll_hint(9)))
 741                                 for (int ic = 0; ic < 9; ic++)
 742                                 {
 743                          /* Load activations from SLM into registers  */
 744
 745                                          uint slm_offset = slm_row_offset_3 + ic * BATCH_PACK * 8 ;
 746
 747                                  act_reg [ ic ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ;
 748                                 }
 749
 750                                 out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[10], weights_reg[1] );
 751                                 out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[11], weights_reg[1] );
 752                                 out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[12], weights_reg[1] );
 753                                 out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[13], weights_reg[1] );
 754                                 out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[14], weights_reg[1] );
 755                                 out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[15], weights_reg[1] );
 756                                 out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[16], weights_reg[1] );
 757
 758                                 out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[11], weights_reg[2] );
 759                                 out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[12], weights_reg[2] );
 760                                 out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[13], weights_reg[2] );
 761                                 out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[14], weights_reg[2] );
 762                                 out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[15], weights_reg[2] );
 763                                 out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[16], weights_reg[2] );
 764                                 out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[17], weights_reg[2] );
 765
 766                                 /* Load weights from SLM into registers - row1, output channels 24..31  */
 767                                 {
 768                                                 __local uint *slm_ptrw1 = slm_ptr1 + 3*slm_read_pixel_offset + 6*8*8;
 769
 770                                             weights_reg[0].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
 771                                             weights_reg[0].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
 772                                                 slm_ptrw1                          += slm_read_pixel_offset;
 773
 774                                                 weights_reg[1].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
 775                                             weights_reg[1].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
 776                                                 slm_ptrw1                          += slm_read_pixel_offset;
 777
 778                                                 weights_reg[2].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) );
 779                                             weights_reg[2].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) );
 780                                 }
 781
 782                                 out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[9], weights_reg[0] );
 783                                 out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[10], weights_reg[0] );
 784                                 out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[11], weights_reg[0] );
 785                                 out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[12], weights_reg[0] );
 786                                 out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[13], weights_reg[0] );
 787                                 out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[14], weights_reg[0] );
 788                                 out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[15], weights_reg[0] );
 789
 790                                 out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[10], weights_reg[1] );
 791                                 out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[11], weights_reg[1] );
 792                                 out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[12], weights_reg[1] );
 793                                 out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[13], weights_reg[1] );
 794                                 out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[14], weights_reg[1] );
 795                                 out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[15], weights_reg[1] );
 796                                 out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[16], weights_reg[1] );
 797
 798                                 out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[11], weights_reg[2] );
 799                                 out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[12], weights_reg[2] );
 800                                 out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[13], weights_reg[2] );
 801                                 out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[14], weights_reg[2] );
 802                                 out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[15], weights_reg[2] );
 803                                 out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[16], weights_reg[2] );
 804                                 out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[17], weights_reg[2] );
 805
 806                         /**********************************************************************************************************
 807                           Third phase - multiply third row of weights  and third row of activations
 808                         ***********************************************************************************************************/
 809
 810                                  /* Load weights from SLM into registers - row2, output channels 0..7  */
 811                                 {
 812                                                 __local uint *slm_ptrw2  = slm_ptr1 + 6*slm_read_pixel_offset;
 813
 814                                             weights_reg[0].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
 815                                             weights_reg[0].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
 816                                                 slm_ptrw2                          += slm_read_pixel_offset;
 817
 818                                                 weights_reg[1].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
 819                                             weights_reg[1].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
 820                                                 slm_ptrw2                            += slm_read_pixel_offset;
 821
 822                                                 weights_reg[2].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
 823                                             weights_reg[2].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
 824                                 }
 825
 826                                 out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[0], weights_reg[0] );
 827                                 out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[1], weights_reg[0] );
 828                                 out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[2], weights_reg[0] );
 829                                 out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[3], weights_reg[0] );
 830                                 out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[4], weights_reg[0] );
 831                                 out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[5], weights_reg[0] );
 832                                 out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[6], weights_reg[0] );
 833
 834                                 out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[1], weights_reg[1] );
 835                                 out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[2], weights_reg[1] );
 836                                 out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[3], weights_reg[1] );
 837                                 out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[4], weights_reg[1] );
 838                                 out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[5], weights_reg[1] );
 839                                 out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[6], weights_reg[1] );
 840                                 out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[7], weights_reg[1] );
 841
 842                                 out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[2], weights_reg[2] );
 843                                 out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[3], weights_reg[2] );
 844                                 out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[4], weights_reg[2] );
 845                                 out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[5], weights_reg[2] );
 846                                 out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[6], weights_reg[2] );
 847                                 out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[7], weights_reg[2] );
 848                                 out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[8], weights_reg[2] );
 849
 850                                      /* Load weights from SLM into registers - row2, output channels 8..15  */
 851                                 {
 852                                                 __local uint *slm_ptrw2 = slm_ptr1 + 6*slm_read_pixel_offset + 2*8*8;
 853
 854                                             weights_reg[0].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
 855                                             weights_reg[0].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
 856                                                 slm_ptrw2                          += slm_read_pixel_offset;
 857
 858                                                 weights_reg[1].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
 859                                             weights_reg[1].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
 860                                                 slm_ptrw2                          += slm_read_pixel_offset;
 861
 862                                                 weights_reg[2].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
 863                                             weights_reg[2].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
 864                                 }
 865
 866                                 out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[0], weights_reg[0] );
 867                                 out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[1], weights_reg[0] );
 868                                 out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[2], weights_reg[0] );
 869                                 out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[3], weights_reg[0] );
 870                                 out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[4], weights_reg[0] );
 871                                 out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[5], weights_reg[0] );
 872                                 out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[6], weights_reg[0] );
 873
 874                                 out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[1], weights_reg[1] );
 875                                 out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[2], weights_reg[1] );
 876                                 out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[3], weights_reg[1] );
 877                                 out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[4], weights_reg[1] );
 878                                 out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[5], weights_reg[1] );
 879                                 out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[6], weights_reg[1] );
 880                                 out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[7], weights_reg[1] );
 881
 882                                 out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[2], weights_reg[2] );
 883                                 out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[3], weights_reg[2] );
 884                                 out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[4], weights_reg[2] );
 885                                 out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[5], weights_reg[2] );
 886                                 out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[6], weights_reg[2] );
 887                                 out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[7], weights_reg[2] );
 888                                 out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[8], weights_reg[2] );
 889
 890                                 /* Load weights from SLM into registers - row2, output channels 16..23  */
 891                                 {
 892                                                 __local uint *slm_ptrw2 = slm_ptr1 + 6*slm_read_pixel_offset + 4*8*8;
 893
 894                                             weights_reg[0].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
 895                                             weights_reg[0].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
 896                                                 slm_ptrw2                          += slm_read_pixel_offset;
 897
 898                                                 weights_reg[1].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
 899                                             weights_reg[1].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
 900                                                 slm_ptrw2                          += slm_read_pixel_offset;
 901
 902                                                 weights_reg[2].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
 903                                             weights_reg[2].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
 904                                 }
 905
 906                                 out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[0], weights_reg[0] );
 907                                 out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[1], weights_reg[0] );
 908                                 out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[2], weights_reg[0] );
 909                                 out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[3], weights_reg[0] );
 910                                 out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[4], weights_reg[0] );
 911                                 out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[5], weights_reg[0] );
 912                                 out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[6], weights_reg[0] );
 913
 914                                 out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[1], weights_reg[1] );
 915                                 out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[2], weights_reg[1] );
 916                                 out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[3], weights_reg[1] );
 917                                 out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[4], weights_reg[1] );
 918                                 out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[5], weights_reg[1] );
 919                                 out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[6], weights_reg[1] );
 920                                 out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[7], weights_reg[1] );
 921
 922                                 out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[2], weights_reg[2] );
 923                                 out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[3], weights_reg[2] );
 924                                 out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[4], weights_reg[2] );
 925                                 out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[5], weights_reg[2] );
 926                                 out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[6], weights_reg[2] );
 927                                 out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[7], weights_reg[2] );
 928                                 out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[8], weights_reg[2] );
 929
 930                                 /* Load weights from SLM into registers - row3, output channels 24..31  */
 931                                 {
 932                                                 __local uint *slm_ptrw2 = slm_ptr1 + 6*slm_read_pixel_offset + 6*8*8;
 933
 934                                             weights_reg[0].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
 935                                             weights_reg[0].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
 936                                                 slm_ptrw2                          += slm_read_pixel_offset;
 937
 938                                                 weights_reg[1].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
 939                                             weights_reg[1].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
 940                                                 slm_ptrw2                          += slm_read_pixel_offset;
 941
 942                                                 weights_reg[2].s0123     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) );
 943                                             weights_reg[2].s4567     = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) );
 944                                 }
 945
 946                                 out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[0], weights_reg[0] );
 947                                 out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[1], weights_reg[0] );
 948                                 out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[2], weights_reg[0] );
 949                                 out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[3], weights_reg[0] );
 950                                 out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[4], weights_reg[0] );
 951                                 out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[5], weights_reg[0] );
 952                                 out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[6], weights_reg[0] );
 953
 954                                 out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[1], weights_reg[1] );
 955                                 out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[2], weights_reg[1] );
 956                                 out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[3], weights_reg[1] );
 957                                 out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[4], weights_reg[1] );
 958                                 out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[5], weights_reg[1] );
 959                                 out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[6], weights_reg[1] );
 960                                 out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[7], weights_reg[1] );
 961
 962                                 out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[2], weights_reg[2] );
 963                                 out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[3], weights_reg[2] );
 964                                 out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[4], weights_reg[2] );
 965                                 out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[5], weights_reg[2] );
 966                                 out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[6], weights_reg[2] );
 967                                 out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[7], weights_reg[2] );
 968                                 out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[8], weights_reg[2] );
 969                 }
 970
 971                         // To make sure all threads in WG have finished compute before next depth tile of activation and weights are loaded into SLM
 972                         barrier(CLK_LOCAL_MEM_FENCE);
 973         } //for kd
 974
 975         /****************************************************************************************************************
 976                 *******************************Output Write Stage****************************************************************
 977                 ****************************************************************************************************************/
 978                         /*
 979                    Outputs will be passed through activation function and quantized to 8 bits before writing
 980                    Output will be in same format as input [K/32][N/4][P][Q][4N][32K] */
 981
 982                         /******************* Write output to SLM *************************************/
 983
 984                 /*  Quantize and pack 4x1 byte - from consectuive n-coordinates
 985                         Each thread produces [1P][7Q][4N][32K]
 986                 Write uint32 from each lane to SLM , the entire thread will write 32-consecutive K-coorindates
 987
 988                         Assume one SLM row as 32 uints ( 32 channels , four batches for each channel - 4NK )
 989                         In SLM 7x7x4x32 present first then the next 32 channels
 990                 */
 991
 992                 if( lid_z <= 6 )
 993         {
 994                         /* feature maps are an array of slicePacks, each H,W position within the slice pack contains 32 8bit feature maps(channels) of 8 different batches */
 995                         uint row_size_bytes        = (_OW + OWPAD) * PACK * BATCH_PACK;
 996
 997                         /* slice_pack is a pack of 32 feature map tiles that are [OH][OW][4][32] that are stored within the full [K/32][N/4][OH][OW][4][32] output */
 998                         uint slice_pack_size_bytes = row_size_bytes * (_OH + OHPAD);
 999
1000                         /* Each output_depth WG writes 64 output channels */
1001
1002                         uint output_depth_index      =  output_depth*2 + threadid_mod_2;
1003                         uint batch_index                         =  batch;
1004
1005                         /* Each WG produces entire 7x7 output, hence no group_y, group_z tiling */
1006
1007             uint output_offset_x = groupy_tile * OUT_X_PITCH;
1008             uint output_offset_y = groupz_tile * OUT_Y_PITCH;
1009                         uint slice_pack_addr_bytes  = output_depth_index * slice_pack_size_bytes * ( BATCH_SIZE / BATCH_PACK ) + batch_index * slice_pack_size_bytes + lid_z * row_size_bytes;
1010
1011                         __global uchar* output_write_ptr = (__global uchar *) &outputs [ slice_pack_addr_bytes + output_offset_x + output_offset_y ];
1012
1013                 const uint feature = output_depth_index * 32 + get_sub_group_local_id();
1014
1015                 const float4 quant_f = as_float4(intel_sub_group_block_read4((__global uint*) (quantizations + feature) ));
1016                 const float4 bias_f = as_float4(intel_sub_group_block_read4((__global uint*) (biases + feature) ));
1017                 const float4 calib_f = as_float4(intel_sub_group_block_read4((__global uint*) (calibrations + feature) ));
1018
1019                 __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
1020                                 for (int col = 0; col < OUT_BLOCK_WIDTH; col++)
1021                 {
1022
1023                                         int4 outvec0 = out_07[col];
1024                                         int4 outvec1 = out_815[col];
1025                                         int4 outvec2 = out_1623[col];
1026                                         int4 outvec3 = out_2431[col];
1027
1028                                         /* Non-Linear Activation & Quantization code */
1029
1030                                         uchar8 out_write_N2K4[2];
1031
1032                     QUANTIZATION;
1033
1034                                         intel_sub_group_block_write_uc8 (  output_write_ptr  , out_write_N2K4[0] );
1035                                         output_write_ptr += 64;
1036                                         intel_sub_group_block_write_uc8 (  output_write_ptr  , out_write_N2K4[1] );
1037                                         output_write_ptr += 64;
1038
1039                                 } // out_block_width-for loop
1040                 }//lid_z loop
1041 } //end of kernel
1042
1043 #undef SCAL
1044 #undef QUANTIZATION