Publishing 2019 R1 content
[platform/upstream/dldt.git] / inference-engine / thirdparty / clDNN / kernel_selector / core / cl_kernels / fused_conv_eltwise_gpu_gemm_fp32.cl
1 /*
2 // Copyright (c) 2018 Intel Corporation
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //      http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 */
16
17 #include "include/include_all.cl"
18 #include "include/sub_group.cl"
19 #include "include/fetch.cl"
20
21 #define TILE_M          2
22 #define TILE_K          FILTER_SIZE_X
23 #define TILE_N          32
24
25 inline uint FUNC(calculate_eltw_input_offset_based_on_output_offset)(uint out_offset, uint strideX, uint strideY)
26 {
27 // bfyx
28     uint tmp_idx = out_offset;
29     uint x_idx = tmp_idx % OUTPUT_SIZE_X;
30     x_idx *= strideX;
31     tmp_idx /= OUTPUT_SIZE_X;
32     uint y_idx = tmp_idx % OUTPUT_SIZE_Y;
33     y_idx *= strideY;
34     tmp_idx /= OUTPUT_SIZE_Y;
35     uint f_idx = tmp_idx % OUTPUT_FEATURE_NUM;
36     tmp_idx /= OUTPUT_FEATURE_NUM;
37     uint b_idx = tmp_idx % OUTPUT_BATCH_NUM;
38
39     return GET_DATA_INDEX(INPUT1, b_idx, f_idx, y_idx, x_idx);
40 }
41
42 __attribute__((intel_reqd_sub_group_size(8)))
43 KERNEL(fused_conv_eltwise_gemm_fp32)(
44     const __global float *src0,
45     __global float *dst,
46     const __global float *src1,
47 #if BIAS_TERM
48     const __global float *bias,
49 #endif
50     uint split_idx,
51     const __global float* src3)
52 {
53 #include "include/vec_typedefs.cl"
54
55     const unsigned group_x = get_group_id(0);
56     const unsigned group_y = get_group_id(1);
57     const unsigned global_x = get_global_id(0);
58     const unsigned global_y = get_global_id(1);
59     const unsigned global_z = get_global_id(2);
60
61     unsigned interleaved_y;
62     unsigned kernel_y;
63     unsigned kernel_idx;
64
65     // Result ctile (*dst) is M rows x N columns
66     // LWG size is 1x8.  Thus each thread calculates 8*M rows x N cols of ctile.
67     float8  blockC00 = 0.f;
68     float8  blockC10 = 0.f;
69     float8  blockC20 = 0.f;
70     float8  blockC30 = 0.f;
71     float8  blockC01 = 0.f;
72     float8  blockC11 = 0.f;
73     float8  blockC21 = 0.f;
74     float8  blockC31 = 0.f;
75
76     const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * INPUT0_FEATURE_NUM;
77     // Src0 (patch input) is directly used as atile.
78     // Each work item points to the start of a different patch.
79     // atile is M rows x K columns.
80     const uint src0_read_offset0_const = INPUT0_OFFSET_WITH_PADDING + in_split_offset
81      + INPUT0_BATCH_PITCH * global_z                                                         // batch offset
82      + ( ( ( global_y * TILE_M + 0 ) / OUTPUT_SIZE_X ) * STRIDE_SIZE_Y * INPUT0_Y_PITCH )    // y offset
83      + ( ( ( global_y * TILE_M + 0 ) % OUTPUT_SIZE_X ) * STRIDE_SIZE_X );                    // x offset
84     const uint src0_read_offset1_const = INPUT0_OFFSET_WITH_PADDING + in_split_offset
85      + INPUT0_BATCH_PITCH * global_z                                                 // batch offset
86      + ( ( ( global_y * TILE_M + 1 ) / OUTPUT_SIZE_X ) * STRIDE_SIZE_Y * INPUT0_Y_PITCH )    // y offset
87      + ( ( ( global_y * TILE_M + 1 ) % OUTPUT_SIZE_X ) * STRIDE_SIZE_X );                    // x offset
88
89     // Src1 (filter) is directly used as btile.
90     // It starts at the top of src1 and walks down.
91     // btile is K rows x N columns.
92     uint src0_read_offset0 = src0_read_offset0_const;
93     uint src0_read_offset1 = src0_read_offset1_const;
94     uint src1_read_offset = ( global_x * TILE_N * 2);
95
96 #define DOT_PRODUCT_8( _result, _rowA, colB )    \
97     {   \
98         _result.s0 = mad( _rowA, sub_group_broadcast( colB,  0 ), _result.s0 );  \
99         _result.s1 = mad( _rowA, sub_group_broadcast( colB,  1 ), _result.s1 );  \
100         _result.s2 = mad( _rowA, sub_group_broadcast( colB,  2 ), _result.s2 );  \
101         _result.s3 = mad( _rowA, sub_group_broadcast( colB,  3 ), _result.s3 );  \
102         _result.s4 = mad( _rowA, sub_group_broadcast( colB,  4 ), _result.s4 );  \
103         _result.s5 = mad( _rowA, sub_group_broadcast( colB,  5 ), _result.s5 );  \
104         _result.s6 = mad( _rowA, sub_group_broadcast( colB,  6 ), _result.s6 );  \
105         _result.s7 = mad( _rowA, sub_group_broadcast( colB,  7 ), _result.s7 );  \
106     }
107
108     // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
109     // Inner loop loads and FMADs one row (FILTER_SIZE_X) of each input patch
110     // and FILTER_SIZE_X/2 rows of interleaved filter.
111     unsigned patch_depth = 0;
112     do
113     {
114         unsigned patch_row = 0;
115         do
116         {
117             // Load atile and btile.
118             // Kernel data is partially interleaved.  Every 2 rows are interleaved at float8 granularity.
119             // The exception is that if FILTER_SIZE_X is odd the last row is not interleaved.  The non
120             // interleaved row is padded with zero to ensure same size as interleaved rows. This
121             // interleaving is done to ensure 0% GDR bank conflicts.  For example, this is how the
122             // kernel data would be arranged before/after interleaving for FILTER_SIZE_X=3.
123             // (0, 0) (8, 0) (16, 0) (24, 0) ...       (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..
124             // (0, 1) (8, 1) (16, 1) (24, 1) ... =>    (0, 2) (8, 2) (16, 2) (24, 2) ...
125             // (0, 2) (8, 2) (16, 2) (24, 2) ...       ...
126             // ...
127             const bool kernel_width_is_odd = FILTER_SIZE_X % 2 == 1;
128
129             float blockA00[FILTER_SIZE_X];
130             float blockA01[FILTER_SIZE_X];
131             
132             // in case the data is not aligned to sizeof(T)*FILTER_SIZE_X we need to use vload or set the data in a loop
133             {
134                 unsigned i = 0;
135                 LOOP(FILTER_SIZE_X, i, 
136                 {
137 #if LEFTOVERS == 1
138                     if(src0_read_offset0_const + (FILTER_SIZE_Y - 1) * INPUT0_Y_PITCH + (INPUT0_FEATURE_NUM - 1) * (INPUT0_FEATURE_PITCH - ( FILTER_SIZE_Y * INPUT0_Y_PITCH )) >= INPUT0_BATCH_NUM * INPUT0_BATCH_PITCH)
139                     {
140                         if(src0_read_offset0 + i < INPUT0_BATCH_NUM * INPUT0_BATCH_PITCH)
141                             blockA00[i] = src0[src0_read_offset0 + i];
142                     }
143                     else
144 #endif
145                         blockA00[i] = src0[src0_read_offset0 + i];
146
147 #if LEFTOVERS == 1
148                     if(src0_read_offset1_const + (FILTER_SIZE_Y - 1) * INPUT0_Y_PITCH + (INPUT0_FEATURE_NUM - 1) * (INPUT0_FEATURE_PITCH - ( FILTER_SIZE_Y * INPUT0_Y_PITCH )) >= INPUT0_BATCH_NUM * INPUT0_BATCH_PITCH)
149                     {
150                         if(src0_read_offset1 + i < INPUT0_BATCH_NUM * INPUT0_BATCH_PITCH)
151                             blockA01[i] = src0[src0_read_offset1 + i];
152                     }
153                     else
154 #endif
155                         blockA01[i] = src0[src0_read_offset1 + i];
156                 } )
157             }
158
159             float*  pblockA00 = (float*)(&blockA00);
160             float*  pblockA01 = (float*)(&blockA01);
161
162             src0_read_offset0 += INPUT0_Y_PITCH;
163             src0_read_offset1 += INPUT0_Y_PITCH;
164
165
166             float blockB00[FILTER_SIZE_X*4];
167             float8* p8BlockB00 = (float8*)blockB00;
168             float4* p4BlockB00 = (float4*)blockB00;
169             float*  pBlockB00 =  (float* )blockB00;
170
171             interleaved_y = 0;
172             LOOP(FILTER_SIZE_X_DIV2, interleaved_y,
173             {
174                 p8BlockB00[interleaved_y] = as_float8( intel_sub_group_block_read8( (const __global uint*)src1 + src1_read_offset ) );
175                 src1_read_offset += ALIGNED_OFM * 2;
176             } )
177             if ( kernel_width_is_odd )
178             {
179                 p4BlockB00[FILTER_SIZE_X - 1] = as_float4( intel_sub_group_block_read4( (const __global uint*)src1 + src1_read_offset ) );
180                 src1_read_offset += ALIGNED_OFM * 2;
181             }
182
183             // Perform MADs
184             kernel_idx = 0;
185             interleaved_y = 0;
186             LOOP(FILTER_SIZE_X_DIV2, interleaved_y,
187             {
188                 kernel_y = interleaved_y * 2;
189                 DOT_PRODUCT_8( blockC00, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
190                 DOT_PRODUCT_8( blockC01, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
191                 DOT_PRODUCT_8( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
192                 DOT_PRODUCT_8( blockC01, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
193                 DOT_PRODUCT_8( blockC10, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
194                 DOT_PRODUCT_8( blockC11, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
195                 DOT_PRODUCT_8( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
196                 DOT_PRODUCT_8( blockC11, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
197                 DOT_PRODUCT_8( blockC20, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
198                 DOT_PRODUCT_8( blockC21, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
199                 DOT_PRODUCT_8( blockC20, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
200                 DOT_PRODUCT_8( blockC21, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
201                 DOT_PRODUCT_8( blockC30, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
202                 DOT_PRODUCT_8( blockC31, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
203                 DOT_PRODUCT_8( blockC30, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
204                 DOT_PRODUCT_8( blockC31, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
205             } )
206             if ( kernel_width_is_odd )
207             {
208                 kernel_y = interleaved_y * 2;
209                 DOT_PRODUCT_8( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] );
210                 DOT_PRODUCT_8( blockC01, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
211                 DOT_PRODUCT_8( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] );
212                 DOT_PRODUCT_8( blockC11, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
213                 DOT_PRODUCT_8( blockC20, pblockA00[kernel_y], pBlockB00[kernel_idx] );
214                 DOT_PRODUCT_8( blockC21, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
215                 DOT_PRODUCT_8( blockC30, pblockA00[kernel_y], pBlockB00[kernel_idx] );
216                 DOT_PRODUCT_8( blockC31, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
217             }
218         }
219
220         //while( ++patch_row < 1 ); //debug
221         while( ++patch_row < FILTER_SIZE_Y );
222
223         src0_read_offset0 += INPUT0_FEATURE_PITCH - ( FILTER_SIZE_Y * INPUT0_Y_PITCH ); // reset to start of next slice of patch
224         src0_read_offset1 += INPUT0_FEATURE_PITCH - ( FILTER_SIZE_Y * INPUT0_Y_PITCH ); // reset to start of next slice of patch
225     }
226     //while ( ++patch_depth < 1 );  //debug
227     while ( ++patch_depth < INPUT0_FEATURE_NUM );
228
229     const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * OUTPUT_FEATURE_NUM;
230     // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
231     // (SIMD * TILE_M) x 1 x TILE_N.  Partial writes most likely generated if padding used.
232     __global float *out0 = dst + OUTPUT_OFFSET + out_split_offset
233      + global_z * OUTPUT_BATCH_PITCH                                                   // batch offset
234      + ( group_x * TILE_N ) * OUTPUT_FEATURE_PITCH                                     // channel offset
235      + ( ( global_y * TILE_M ) / OUTPUT_SIZE_X ) * OUTPUT_Y_PITCH                      // y offset
236      + ( ( global_y * TILE_M ) % OUTPUT_SIZE_X );                                      // x offset
237     __global float *out1 = dst + OUTPUT_OFFSET + out_split_offset
238      + global_z * OUTPUT_BATCH_PITCH                                                   // batch offset
239      + ( group_x * TILE_N ) * OUTPUT_FEATURE_PITCH                                     // channel offset
240      + ( ( global_y * TILE_M + 1 ) / OUTPUT_SIZE_X ) * OUTPUT_Y_PITCH                  // y offset
241      + ( ( global_y * TILE_M + 1 ) % OUTPUT_SIZE_X );                                  // x offset
242
243     #if BIAS_TERM
244     __global float8* biasPtr = (__global float8*) (bias + group_x * TILE_N);
245     #endif
246     
247     uint out0_offset = OUTPUT_OFFSET + out_split_offset
248      + global_z * OUTPUT_BATCH_PITCH                                                   // batch offset
249      + ( group_x * TILE_N ) * OUTPUT_FEATURE_PITCH                                     // channel offset
250      + ( ( global_y * TILE_M ) / OUTPUT_SIZE_X ) * OUTPUT_Y_PITCH                      // y offset
251      + ( ( global_y * TILE_M ) % OUTPUT_SIZE_X );                                      // x offset
252
253      uint out1_offset = OUTPUT_OFFSET + out_split_offset
254      + global_z * OUTPUT_BATCH_PITCH                                                   // batch offset
255      + ( group_x * TILE_N ) * OUTPUT_FEATURE_PITCH                                     // channel offset
256      + ( ( global_y * TILE_M + 1 ) / OUTPUT_SIZE_X ) * OUTPUT_Y_PITCH                  // y offset
257      + ( ( global_y * TILE_M + 1 ) % OUTPUT_SIZE_X ); 
258
259     //-----------------------------------------------------------------------------------------------//
260     // OUTPUT PHASE
261     //-----------------------------------------------------------------------------------------------//
262     if( global_y * TILE_M < OUTPUT_SIZE_X * OUTPUT_SIZE_Y )
263     {
264         if ( ( OUTPUT_FEATURE_NUM % TILE_N ) == 0 )
265         {
266             #if BIAS_TERM
267             blockC00 += *biasPtr;
268             blockC10 += *(biasPtr + 1);
269             blockC20 += *(biasPtr + 2);
270             blockC30 += *(biasPtr + 3);
271             #endif
272
273             blockC00 = ACTIVATION(blockC00, NL_M, NL_N);
274             blockC10 = ACTIVATION(blockC10, NL_M, NL_N);
275             blockC20 = ACTIVATION(blockC20, NL_M, NL_N);
276             blockC30 = ACTIVATION(blockC30, NL_M, NL_N);
277
278             // eltwise
279             uint src3_offset = FUNC_CALL(calculate_eltw_input_offset_based_on_output_offset)(out0_offset, ELTW_STRIDE_X,ELTW_STRIDE_Y);
280             for(uint i = 0; i < 8; i++)
281             {
282                 blockC00[i] += src3[src3_offset + (i + 0 )* INPUT1_FEATURE_PITCH];
283                 blockC10[i] += src3[src3_offset + (i + 8 )* INPUT1_FEATURE_PITCH];
284                 blockC20[i] += src3[src3_offset + (i + 16)* INPUT1_FEATURE_PITCH];
285                 blockC30[i] += src3[src3_offset + (i + 24)* INPUT1_FEATURE_PITCH];
286             }
287
288             blockC00 = ACTIVATION_ELTW(blockC00, NL_M_ELTW, NL_N_ELTW);
289             blockC10 = ACTIVATION_ELTW(blockC10, NL_M_ELTW, NL_N_ELTW);
290             blockC20 = ACTIVATION_ELTW(blockC20, NL_M_ELTW, NL_N_ELTW);
291             blockC30 = ACTIVATION_ELTW(blockC30, NL_M_ELTW, NL_N_ELTW);
292             // end eltwise
293
294             for( unsigned i = 0; i < 8; i++ )
295             {
296                 out0[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i];
297                 out0[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC10[i];
298                 out0[(16+i) * OUTPUT_FEATURE_PITCH] = blockC20[i];
299                 out0[(24+i) * OUTPUT_FEATURE_PITCH] = blockC30[i];
300             }
301         }
302         else
303         {
304             if ( ( global_x + 1 ) < get_global_size(0) )
305             {
306                 #if BIAS_TERM
307                 blockC00 += *biasPtr;
308                 blockC10 += *(biasPtr + 1);
309                 blockC20 += *(biasPtr + 2);
310                 blockC30 += *(biasPtr + 3);
311                 #endif
312
313                 blockC00 = ACTIVATION(blockC00, NL_M, NL_N);
314                 blockC10 = ACTIVATION(blockC10, NL_M, NL_N);
315                 blockC20 = ACTIVATION(blockC20, NL_M, NL_N);
316                 blockC30 = ACTIVATION(blockC30, NL_M, NL_N);
317
318                 // eltwise
319                 uint src3_offset = FUNC_CALL(calculate_eltw_input_offset_based_on_output_offset)(out0_offset, ELTW_STRIDE_X,ELTW_STRIDE_Y);
320                 for(uint i = 0; i < 8; i++)
321                 {
322                     blockC00[i] += src3[src3_offset + (i + 0 )* INPUT1_FEATURE_PITCH];
323                     blockC10[i] += src3[src3_offset + (i + 8 )* INPUT1_FEATURE_PITCH];
324                     blockC20[i] += src3[src3_offset + (i + 16)* INPUT1_FEATURE_PITCH];
325                     blockC30[i] += src3[src3_offset + (i + 24)* INPUT1_FEATURE_PITCH];
326                 }
327     
328                 blockC00 = ACTIVATION_ELTW(blockC00, NL_M_ELTW, NL_N_ELTW);
329                 blockC10 = ACTIVATION_ELTW(blockC10, NL_M_ELTW, NL_N_ELTW);
330                 blockC20 = ACTIVATION_ELTW(blockC20, NL_M_ELTW, NL_N_ELTW);
331                 blockC30 = ACTIVATION_ELTW(blockC30, NL_M_ELTW, NL_N_ELTW);
332                 // end eltwise
333
334                 for ( unsigned i = 0; i < 8; i++ )
335                 {
336                     out0[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i];
337                     out0[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC10[i];
338                     out0[(16+i) * OUTPUT_FEATURE_PITCH] = blockC20[i];
339                     out0[(24+i) * OUTPUT_FEATURE_PITCH] = blockC30[i];
340                 }
341             }
342             else
343             {
344                 if ( ( OUTPUT_FEATURE_NUM % TILE_N ) >= 24 )
345                 {
346                     #if BIAS_TERM
347                     blockC00 += *biasPtr;
348                     blockC10 += *(biasPtr + 1);
349                     blockC20 += *(biasPtr + 2);
350                     if (( OUTPUT_FEATURE_NUM % TILE_N) > 24 ) blockC30 += *(biasPtr + 3);
351                     #endif
352
353                     blockC00 = ACTIVATION(blockC00, NL_M, NL_N);
354                     blockC10 = ACTIVATION(blockC10, NL_M, NL_N);
355                     blockC20 = ACTIVATION(blockC20, NL_M, NL_N);
356
357                     // remaining output channels
358                     for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++)
359                     {
360                         blockC30[i] = ACTIVATION(blockC30[i], NL_M, NL_N);
361                     }
362
363                     // eltwise
364                     uint src3_offset = FUNC_CALL(calculate_eltw_input_offset_based_on_output_offset)(out0_offset, ELTW_STRIDE_X,ELTW_STRIDE_Y);
365                     for(uint i = 0; i < 8; i++)
366                     {
367                         blockC00[i] += src3[src3_offset + (i + 0 )* INPUT1_FEATURE_PITCH];
368                         blockC10[i] += src3[src3_offset + (i + 8 )* INPUT1_FEATURE_PITCH];
369                         blockC20[i] += src3[src3_offset + (i + 16)* INPUT1_FEATURE_PITCH];
370                     }
371
372                     // remaining output channels
373                     for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++)
374                     {
375                         blockC30[i] += src3[src3_offset + (i + 24 )* INPUT1_FEATURE_PITCH];
376                         blockC30[i] = ACTIVATION_ELTW(blockC30[i], NL_M_ELTW, NL_N_ELTW);
377                     }
378         
379                     blockC00 = ACTIVATION_ELTW(blockC00, NL_M_ELTW, NL_N_ELTW);
380                     blockC10 = ACTIVATION_ELTW(blockC10, NL_M_ELTW, NL_N_ELTW);
381                     blockC20 = ACTIVATION_ELTW(blockC20, NL_M_ELTW, NL_N_ELTW);
382                     // end eltwise
383
384                     for (unsigned i = 0; i < 8; i++)
385                     {
386                         out0[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i];
387                         out0[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC10[i];
388                         out0[(16+i) * OUTPUT_FEATURE_PITCH] = blockC20[i];
389                     }
390
391                     // remaining output channels
392                     for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++)
393                     {
394                         out0[(24+i) * OUTPUT_FEATURE_PITCH] = blockC30[i];
395                     }
396                 }
397                 else if ( ( OUTPUT_FEATURE_NUM % TILE_N ) >= 16 )
398                 {
399                     #if BIAS_TERM
400                     blockC00 += *biasPtr;
401                     blockC10 += *(biasPtr + 1);
402                     if (( OUTPUT_FEATURE_NUM % TILE_N) > 16 )
403                         blockC20 += *(biasPtr + 2);
404                     #endif
405
406                     blockC00 = ACTIVATION(blockC00, NL_M, NL_N);
407                     blockC10 = ACTIVATION(blockC10, NL_M, NL_N);
408
409                     for (unsigned i = 0; i < 8; i++)
410                     {
411                         out0[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i];
412                         out0[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC10[i];
413                     }
414
415                     for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++)
416                     {
417                         out0[(16+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION(blockC20[i], NL_M, NL_N);
418
419                     }
420                 }
421                 else if ( ( OUTPUT_FEATURE_NUM % TILE_N ) >= 8 )
422                 {
423                     #if BIAS_TERM
424                     blockC00 += *biasPtr;
425                     if (( OUTPUT_FEATURE_NUM % TILE_N) > 8 )
426                         blockC10 += *(biasPtr + 1);
427                     #endif
428
429                     blockC00 = ACTIVATION(blockC00, NL_M, NL_N);
430
431                     for (unsigned i = 0; i < 8; i++)
432                     {
433                         out0[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i];
434                     }
435
436                     for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++)
437                     {
438                         out0[(8+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION(blockC10[i], NL_M, NL_N);
439                     }
440                 }
441                 else
442                 {
443                     #if BIAS_TERM
444                     blockC00 += *biasPtr;
445                     #endif
446                     for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++)
447                     {
448                         out0[( 0+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION(blockC00[i], NL_M, NL_N);
449                     }
450                 }
451             }
452         }
453     }
454
455     if ((global_y * TILE_M + 1) < OUTPUT_SIZE_X * OUTPUT_SIZE_Y )
456     {
457         if ( ( OUTPUT_FEATURE_NUM % TILE_N ) == 0 )
458         {
459             #if BIAS_TERM
460             blockC01 += *biasPtr;
461             blockC11 += *(biasPtr + 1);
462             blockC21 += *(biasPtr + 2);
463             blockC31 += *(biasPtr + 3);
464             #endif
465
466             blockC01 = ACTIVATION(blockC01, NL_M, NL_N);
467             blockC11 = ACTIVATION(blockC11, NL_M, NL_N);
468             blockC21 = ACTIVATION(blockC21, NL_M, NL_N);
469             blockC31 = ACTIVATION(blockC31, NL_M, NL_N);
470
471             // eltwise
472             uint src3_offset = FUNC_CALL(calculate_eltw_input_offset_based_on_output_offset)(out1_offset, ELTW_STRIDE_X,ELTW_STRIDE_Y);
473             for(uint i = 0; i < 8; i++)
474             {
475                 blockC01[i] += src3[src3_offset + (i + 0 )* INPUT1_FEATURE_PITCH];
476                 blockC11[i] += src3[src3_offset + (i + 8 )* INPUT1_FEATURE_PITCH];
477                 blockC21[i] += src3[src3_offset + (i + 16)* INPUT1_FEATURE_PITCH];
478                 blockC31[i] += src3[src3_offset + (i + 24)* INPUT1_FEATURE_PITCH];
479             }
480
481             blockC01 = ACTIVATION_ELTW(blockC01, NL_M_ELTW, NL_N_ELTW);
482             blockC11 = ACTIVATION_ELTW(blockC11, NL_M_ELTW, NL_N_ELTW);
483             blockC21 = ACTIVATION_ELTW(blockC21, NL_M_ELTW, NL_N_ELTW);
484             blockC31 = ACTIVATION_ELTW(blockC31, NL_M_ELTW, NL_N_ELTW);
485             // end eltwise
486
487             for( unsigned i = 0; i < 8; i++ )
488             {
489                 out1[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC01[i];
490                 out1[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC11[i];
491                 out1[(16+i) * OUTPUT_FEATURE_PITCH] = blockC21[i];
492                 out1[(24+i) * OUTPUT_FEATURE_PITCH] = blockC31[i];
493             }
494         }
495         else
496         {
497             if ( ( global_x + 1 ) < get_global_size(0) )
498             {
499                 #if BIAS_TERM
500                 blockC01 += *biasPtr;
501                 blockC11 += *(biasPtr + 1);
502                 blockC21 += *(biasPtr + 2);
503                 blockC31 += *(biasPtr + 3);
504                 #endif
505
506                 blockC01 = ACTIVATION(blockC01, NL_M, NL_N);
507                 blockC11 = ACTIVATION(blockC11, NL_M, NL_N);
508                 blockC21 = ACTIVATION(blockC21, NL_M, NL_N);
509                 blockC31 = ACTIVATION(blockC31, NL_M, NL_N);
510
511                 for ( unsigned i = 0; i < 8; i++ )
512                 {
513                     out1[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC01[i];
514                     out1[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC11[i];
515                     out1[(16+i) * OUTPUT_FEATURE_PITCH] = blockC21[i];
516                     out1[(24+i) * OUTPUT_FEATURE_PITCH] = blockC31[i];
517                 }
518             }
519             else
520             {
521                 if ( ( OUTPUT_FEATURE_NUM % TILE_N ) >= 24 )
522                 {
523                     #if BIAS_TERM
524                     blockC01 += *biasPtr;
525                     blockC11 += *(biasPtr + 1);
526                     blockC21 += *(biasPtr + 2);
527                     if ( ( OUTPUT_FEATURE_NUM % TILE_N ) > 24 ) blockC31 += *(biasPtr + 3);
528                     #endif
529
530                     blockC01 = ACTIVATION(blockC01, NL_M, NL_N);
531                     blockC11 = ACTIVATION(blockC11, NL_M, NL_N);
532                     blockC21 = ACTIVATION(blockC21, NL_M, NL_N);
533
534                     for (unsigned i = 0; i < 8; i++)
535                     {
536                         out1[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC01[i];
537                         out1[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC11[i];
538                         out1[(16+i) * OUTPUT_FEATURE_PITCH] = blockC21[i];
539                     }
540
541                     // Remaining channels
542                     for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++)
543                     {
544                         out1[(24+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION(blockC31[i], NL_M, NL_N);
545                     }
546                 }
547                 else if ( ( OUTPUT_FEATURE_NUM % TILE_N ) >= 16 )
548                 {
549                     #if BIAS_TERM
550                     blockC01 += *biasPtr;
551                     blockC11 += *(biasPtr + 1);
552                     if ( ( OUTPUT_FEATURE_NUM % TILE_N ) > 16 ) blockC21 += *(biasPtr + 2);
553                     #endif
554
555                     blockC01 = ACTIVATION(blockC01, NL_M, NL_N);
556                     blockC11 = ACTIVATION(blockC11, NL_M, NL_N);
557
558                     for (unsigned i = 0; i < 8; i++)
559                     {
560                         out1[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC01[i];
561                         out1[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC11[i];
562                     }
563
564                     for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++)
565                     {
566                         out1[(16+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION(blockC21[i], NL_M, NL_N);
567                     }
568                 }
569                 else if ( ( OUTPUT_FEATURE_NUM % TILE_N ) >= 8 )
570                 {
571                     #if BIAS_TERM
572                     blockC01 += *biasPtr;
573                     if ( ( OUTPUT_FEATURE_NUM % TILE_N ) > 8 ) blockC11 += *(biasPtr + 1);
574                     #endif
575
576                     blockC01 = ACTIVATION(blockC01, NL_M, NL_N);
577
578                     for (unsigned i = 0; i < 8; i++)
579                     {
580                         out1[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC01[i];
581                     }
582
583                     for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++)
584                     {
585                         out1[(8+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION(blockC11[i], NL_M, NL_N);
586                     }
587                 }
588                 else
589                 {
590                     #if BIAS_TERM
591                     blockC01 += *biasPtr;
592                     #endif
593
594                     for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++)
595                     {
596                         out1[( 0+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION(blockC01[i], NL_M, NL_N);
597                     }
598                 }
599             }
600         }
601     }
602 }