2 // Copyright (c) 2018 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
17 #include "include/include_all.cl"
18 #include "include/sub_group.cl"
19 #include "include/fetch.cl"
22 #define TILE_K FILTER_SIZE_X
25 inline uint FUNC(calculate_eltw_input_offset_based_on_output_offset)(uint out_offset, uint strideX, uint strideY)
28 uint tmp_idx = out_offset;
29 uint x_idx = tmp_idx % OUTPUT_SIZE_X;
31 tmp_idx /= OUTPUT_SIZE_X;
32 uint y_idx = tmp_idx % OUTPUT_SIZE_Y;
34 tmp_idx /= OUTPUT_SIZE_Y;
35 uint f_idx = tmp_idx % OUTPUT_FEATURE_NUM;
36 tmp_idx /= OUTPUT_FEATURE_NUM;
37 uint b_idx = tmp_idx % OUTPUT_BATCH_NUM;
39 return GET_DATA_INDEX(INPUT1, b_idx, f_idx, y_idx, x_idx);
42 __attribute__((intel_reqd_sub_group_size(8)))
43 KERNEL(fused_conv_eltwise_gemm_fp32)(
44 const __global float *src0,
46 const __global float *src1,
48 const __global float *bias,
51 const __global float* src3)
53 #include "include/vec_typedefs.cl"
55 const unsigned group_x = get_group_id(0);
56 const unsigned group_y = get_group_id(1);
57 const unsigned global_x = get_global_id(0);
58 const unsigned global_y = get_global_id(1);
59 const unsigned global_z = get_global_id(2);
61 unsigned interleaved_y;
65 // Result ctile (*dst) is M rows x N columns
66 // LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile.
67 float8 blockC00 = 0.f;
68 float8 blockC10 = 0.f;
69 float8 blockC20 = 0.f;
70 float8 blockC30 = 0.f;
71 float8 blockC01 = 0.f;
72 float8 blockC11 = 0.f;
73 float8 blockC21 = 0.f;
74 float8 blockC31 = 0.f;
76 const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * INPUT0_FEATURE_NUM;
77 // Src0 (patch input) is directly used as atile.
78 // Each work item points to the start of a different patch.
79 // atile is M rows x K columns.
80 const uint src0_read_offset0_const = INPUT0_OFFSET_WITH_PADDING + in_split_offset
81 + INPUT0_BATCH_PITCH * global_z // batch offset
82 + ( ( ( global_y * TILE_M + 0 ) / OUTPUT_SIZE_X ) * STRIDE_SIZE_Y * INPUT0_Y_PITCH ) // y offset
83 + ( ( ( global_y * TILE_M + 0 ) % OUTPUT_SIZE_X ) * STRIDE_SIZE_X ); // x offset
84 const uint src0_read_offset1_const = INPUT0_OFFSET_WITH_PADDING + in_split_offset
85 + INPUT0_BATCH_PITCH * global_z // batch offset
86 + ( ( ( global_y * TILE_M + 1 ) / OUTPUT_SIZE_X ) * STRIDE_SIZE_Y * INPUT0_Y_PITCH ) // y offset
87 + ( ( ( global_y * TILE_M + 1 ) % OUTPUT_SIZE_X ) * STRIDE_SIZE_X ); // x offset
89 // Src1 (filter) is directly used as btile.
90 // It starts at the top of src1 and walks down.
91 // btile is K rows x N columns.
92 uint src0_read_offset0 = src0_read_offset0_const;
93 uint src0_read_offset1 = src0_read_offset1_const;
94 uint src1_read_offset = ( global_x * TILE_N * 2);
96 #define DOT_PRODUCT_8( _result, _rowA, colB ) \
98 _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \
99 _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \
100 _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \
101 _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \
102 _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \
103 _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \
104 _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \
105 _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \
108 // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
109 // Inner loop loads and FMADs one row (FILTER_SIZE_X) of each input patch
110 // and FILTER_SIZE_X/2 rows of interleaved filter.
111 unsigned patch_depth = 0;
114 unsigned patch_row = 0;
117 // Load atile and btile.
118 // Kernel data is partially interleaved. Every 2 rows are interleaved at float8 granularity.
119 // The exception is that if FILTER_SIZE_X is odd the last row is not interleaved. The non
120 // interleaved row is padded with zero to ensure same size as interleaved rows. This
121 // interleaving is done to ensure 0% GDR bank conflicts. For example, this is how the
122 // kernel data would be arranged before/after interleaving for FILTER_SIZE_X=3.
123 // (0, 0) (8, 0) (16, 0) (24, 0) ... (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..
124 // (0, 1) (8, 1) (16, 1) (24, 1) ... => (0, 2) (8, 2) (16, 2) (24, 2) ...
125 // (0, 2) (8, 2) (16, 2) (24, 2) ... ...
127 const bool kernel_width_is_odd = FILTER_SIZE_X % 2 == 1;
129 float blockA00[FILTER_SIZE_X];
130 float blockA01[FILTER_SIZE_X];
132 // in case the data is not aligned to sizeof(T)*FILTER_SIZE_X we need to use vload or set the data in a loop
135 LOOP(FILTER_SIZE_X, i,
138 if(src0_read_offset0_const + (FILTER_SIZE_Y - 1) * INPUT0_Y_PITCH + (INPUT0_FEATURE_NUM - 1) * (INPUT0_FEATURE_PITCH - ( FILTER_SIZE_Y * INPUT0_Y_PITCH )) >= INPUT0_BATCH_NUM * INPUT0_BATCH_PITCH)
140 if(src0_read_offset0 + i < INPUT0_BATCH_NUM * INPUT0_BATCH_PITCH)
141 blockA00[i] = src0[src0_read_offset0 + i];
145 blockA00[i] = src0[src0_read_offset0 + i];
148 if(src0_read_offset1_const + (FILTER_SIZE_Y - 1) * INPUT0_Y_PITCH + (INPUT0_FEATURE_NUM - 1) * (INPUT0_FEATURE_PITCH - ( FILTER_SIZE_Y * INPUT0_Y_PITCH )) >= INPUT0_BATCH_NUM * INPUT0_BATCH_PITCH)
150 if(src0_read_offset1 + i < INPUT0_BATCH_NUM * INPUT0_BATCH_PITCH)
151 blockA01[i] = src0[src0_read_offset1 + i];
155 blockA01[i] = src0[src0_read_offset1 + i];
159 float* pblockA00 = (float*)(&blockA00);
160 float* pblockA01 = (float*)(&blockA01);
162 src0_read_offset0 += INPUT0_Y_PITCH;
163 src0_read_offset1 += INPUT0_Y_PITCH;
166 float blockB00[FILTER_SIZE_X*4];
167 float8* p8BlockB00 = (float8*)blockB00;
168 float4* p4BlockB00 = (float4*)blockB00;
169 float* pBlockB00 = (float* )blockB00;
172 LOOP(FILTER_SIZE_X_DIV2, interleaved_y,
174 p8BlockB00[interleaved_y] = as_float8( intel_sub_group_block_read8( (const __global uint*)src1 + src1_read_offset ) );
175 src1_read_offset += ALIGNED_OFM * 2;
177 if ( kernel_width_is_odd )
179 p4BlockB00[FILTER_SIZE_X - 1] = as_float4( intel_sub_group_block_read4( (const __global uint*)src1 + src1_read_offset ) );
180 src1_read_offset += ALIGNED_OFM * 2;
186 LOOP(FILTER_SIZE_X_DIV2, interleaved_y,
188 kernel_y = interleaved_y * 2;
189 DOT_PRODUCT_8( blockC00, pblockA00[kernel_y ], pBlockB00[kernel_idx] );
190 DOT_PRODUCT_8( blockC01, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;
191 DOT_PRODUCT_8( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
192 DOT_PRODUCT_8( blockC01, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
193 DOT_PRODUCT_8( blockC10, pblockA00[kernel_y ], pBlockB00[kernel_idx] );
194 DOT_PRODUCT_8( blockC11, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;
195 DOT_PRODUCT_8( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
196 DOT_PRODUCT_8( blockC11, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
197 DOT_PRODUCT_8( blockC20, pblockA00[kernel_y ], pBlockB00[kernel_idx] );
198 DOT_PRODUCT_8( blockC21, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;
199 DOT_PRODUCT_8( blockC20, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
200 DOT_PRODUCT_8( blockC21, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
201 DOT_PRODUCT_8( blockC30, pblockA00[kernel_y ], pBlockB00[kernel_idx] );
202 DOT_PRODUCT_8( blockC31, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;
203 DOT_PRODUCT_8( blockC30, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
204 DOT_PRODUCT_8( blockC31, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
206 if ( kernel_width_is_odd )
208 kernel_y = interleaved_y * 2;
209 DOT_PRODUCT_8( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] );
210 DOT_PRODUCT_8( blockC01, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
211 DOT_PRODUCT_8( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] );
212 DOT_PRODUCT_8( blockC11, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
213 DOT_PRODUCT_8( blockC20, pblockA00[kernel_y], pBlockB00[kernel_idx] );
214 DOT_PRODUCT_8( blockC21, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
215 DOT_PRODUCT_8( blockC30, pblockA00[kernel_y], pBlockB00[kernel_idx] );
216 DOT_PRODUCT_8( blockC31, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
220 //while( ++patch_row < 1 ); //debug
221 while( ++patch_row < FILTER_SIZE_Y );
223 src0_read_offset0 += INPUT0_FEATURE_PITCH - ( FILTER_SIZE_Y * INPUT0_Y_PITCH ); // reset to start of next slice of patch
224 src0_read_offset1 += INPUT0_FEATURE_PITCH - ( FILTER_SIZE_Y * INPUT0_Y_PITCH ); // reset to start of next slice of patch
226 //while ( ++patch_depth < 1 ); //debug
227 while ( ++patch_depth < INPUT0_FEATURE_NUM );
229 const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * OUTPUT_FEATURE_NUM;
230 // Dst resembles a cube of width x height x (output channel * batches). Each tile writes:
231 // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.
232 __global float *out0 = dst + OUTPUT_OFFSET + out_split_offset
233 + global_z * OUTPUT_BATCH_PITCH // batch offset
234 + ( group_x * TILE_N ) * OUTPUT_FEATURE_PITCH // channel offset
235 + ( ( global_y * TILE_M ) / OUTPUT_SIZE_X ) * OUTPUT_Y_PITCH // y offset
236 + ( ( global_y * TILE_M ) % OUTPUT_SIZE_X ); // x offset
237 __global float *out1 = dst + OUTPUT_OFFSET + out_split_offset
238 + global_z * OUTPUT_BATCH_PITCH // batch offset
239 + ( group_x * TILE_N ) * OUTPUT_FEATURE_PITCH // channel offset
240 + ( ( global_y * TILE_M + 1 ) / OUTPUT_SIZE_X ) * OUTPUT_Y_PITCH // y offset
241 + ( ( global_y * TILE_M + 1 ) % OUTPUT_SIZE_X ); // x offset
244 __global float8* biasPtr = (__global float8*) (bias + group_x * TILE_N);
247 uint out0_offset = OUTPUT_OFFSET + out_split_offset
248 + global_z * OUTPUT_BATCH_PITCH // batch offset
249 + ( group_x * TILE_N ) * OUTPUT_FEATURE_PITCH // channel offset
250 + ( ( global_y * TILE_M ) / OUTPUT_SIZE_X ) * OUTPUT_Y_PITCH // y offset
251 + ( ( global_y * TILE_M ) % OUTPUT_SIZE_X ); // x offset
253 uint out1_offset = OUTPUT_OFFSET + out_split_offset
254 + global_z * OUTPUT_BATCH_PITCH // batch offset
255 + ( group_x * TILE_N ) * OUTPUT_FEATURE_PITCH // channel offset
256 + ( ( global_y * TILE_M + 1 ) / OUTPUT_SIZE_X ) * OUTPUT_Y_PITCH // y offset
257 + ( ( global_y * TILE_M + 1 ) % OUTPUT_SIZE_X );
259 //-----------------------------------------------------------------------------------------------//
261 //-----------------------------------------------------------------------------------------------//
262 if( global_y * TILE_M < OUTPUT_SIZE_X * OUTPUT_SIZE_Y )
264 if ( ( OUTPUT_FEATURE_NUM % TILE_N ) == 0 )
267 blockC00 += *biasPtr;
268 blockC10 += *(biasPtr + 1);
269 blockC20 += *(biasPtr + 2);
270 blockC30 += *(biasPtr + 3);
273 blockC00 = ACTIVATION(blockC00, NL_M, NL_N);
274 blockC10 = ACTIVATION(blockC10, NL_M, NL_N);
275 blockC20 = ACTIVATION(blockC20, NL_M, NL_N);
276 blockC30 = ACTIVATION(blockC30, NL_M, NL_N);
279 uint src3_offset = FUNC_CALL(calculate_eltw_input_offset_based_on_output_offset)(out0_offset, ELTW_STRIDE_X,ELTW_STRIDE_Y);
280 for(uint i = 0; i < 8; i++)
282 blockC00[i] += src3[src3_offset + (i + 0 )* INPUT1_FEATURE_PITCH];
283 blockC10[i] += src3[src3_offset + (i + 8 )* INPUT1_FEATURE_PITCH];
284 blockC20[i] += src3[src3_offset + (i + 16)* INPUT1_FEATURE_PITCH];
285 blockC30[i] += src3[src3_offset + (i + 24)* INPUT1_FEATURE_PITCH];
288 blockC00 = ACTIVATION_ELTW(blockC00, NL_M_ELTW, NL_N_ELTW);
289 blockC10 = ACTIVATION_ELTW(blockC10, NL_M_ELTW, NL_N_ELTW);
290 blockC20 = ACTIVATION_ELTW(blockC20, NL_M_ELTW, NL_N_ELTW);
291 blockC30 = ACTIVATION_ELTW(blockC30, NL_M_ELTW, NL_N_ELTW);
294 for( unsigned i = 0; i < 8; i++ )
296 out0[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i];
297 out0[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC10[i];
298 out0[(16+i) * OUTPUT_FEATURE_PITCH] = blockC20[i];
299 out0[(24+i) * OUTPUT_FEATURE_PITCH] = blockC30[i];
304 if ( ( global_x + 1 ) < get_global_size(0) )
307 blockC00 += *biasPtr;
308 blockC10 += *(biasPtr + 1);
309 blockC20 += *(biasPtr + 2);
310 blockC30 += *(biasPtr + 3);
313 blockC00 = ACTIVATION(blockC00, NL_M, NL_N);
314 blockC10 = ACTIVATION(blockC10, NL_M, NL_N);
315 blockC20 = ACTIVATION(blockC20, NL_M, NL_N);
316 blockC30 = ACTIVATION(blockC30, NL_M, NL_N);
319 uint src3_offset = FUNC_CALL(calculate_eltw_input_offset_based_on_output_offset)(out0_offset, ELTW_STRIDE_X,ELTW_STRIDE_Y);
320 for(uint i = 0; i < 8; i++)
322 blockC00[i] += src3[src3_offset + (i + 0 )* INPUT1_FEATURE_PITCH];
323 blockC10[i] += src3[src3_offset + (i + 8 )* INPUT1_FEATURE_PITCH];
324 blockC20[i] += src3[src3_offset + (i + 16)* INPUT1_FEATURE_PITCH];
325 blockC30[i] += src3[src3_offset + (i + 24)* INPUT1_FEATURE_PITCH];
328 blockC00 = ACTIVATION_ELTW(blockC00, NL_M_ELTW, NL_N_ELTW);
329 blockC10 = ACTIVATION_ELTW(blockC10, NL_M_ELTW, NL_N_ELTW);
330 blockC20 = ACTIVATION_ELTW(blockC20, NL_M_ELTW, NL_N_ELTW);
331 blockC30 = ACTIVATION_ELTW(blockC30, NL_M_ELTW, NL_N_ELTW);
334 for ( unsigned i = 0; i < 8; i++ )
336 out0[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i];
337 out0[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC10[i];
338 out0[(16+i) * OUTPUT_FEATURE_PITCH] = blockC20[i];
339 out0[(24+i) * OUTPUT_FEATURE_PITCH] = blockC30[i];
344 if ( ( OUTPUT_FEATURE_NUM % TILE_N ) >= 24 )
347 blockC00 += *biasPtr;
348 blockC10 += *(biasPtr + 1);
349 blockC20 += *(biasPtr + 2);
350 if (( OUTPUT_FEATURE_NUM % TILE_N) > 24 ) blockC30 += *(biasPtr + 3);
353 blockC00 = ACTIVATION(blockC00, NL_M, NL_N);
354 blockC10 = ACTIVATION(blockC10, NL_M, NL_N);
355 blockC20 = ACTIVATION(blockC20, NL_M, NL_N);
357 // remaining output channels
358 for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++)
360 blockC30[i] = ACTIVATION(blockC30[i], NL_M, NL_N);
364 uint src3_offset = FUNC_CALL(calculate_eltw_input_offset_based_on_output_offset)(out0_offset, ELTW_STRIDE_X,ELTW_STRIDE_Y);
365 for(uint i = 0; i < 8; i++)
367 blockC00[i] += src3[src3_offset + (i + 0 )* INPUT1_FEATURE_PITCH];
368 blockC10[i] += src3[src3_offset + (i + 8 )* INPUT1_FEATURE_PITCH];
369 blockC20[i] += src3[src3_offset + (i + 16)* INPUT1_FEATURE_PITCH];
372 // remaining output channels
373 for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++)
375 blockC30[i] += src3[src3_offset + (i + 24 )* INPUT1_FEATURE_PITCH];
376 blockC30[i] = ACTIVATION_ELTW(blockC30[i], NL_M_ELTW, NL_N_ELTW);
379 blockC00 = ACTIVATION_ELTW(blockC00, NL_M_ELTW, NL_N_ELTW);
380 blockC10 = ACTIVATION_ELTW(blockC10, NL_M_ELTW, NL_N_ELTW);
381 blockC20 = ACTIVATION_ELTW(blockC20, NL_M_ELTW, NL_N_ELTW);
384 for (unsigned i = 0; i < 8; i++)
386 out0[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i];
387 out0[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC10[i];
388 out0[(16+i) * OUTPUT_FEATURE_PITCH] = blockC20[i];
391 // remaining output channels
392 for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++)
394 out0[(24+i) * OUTPUT_FEATURE_PITCH] = blockC30[i];
397 else if ( ( OUTPUT_FEATURE_NUM % TILE_N ) >= 16 )
400 blockC00 += *biasPtr;
401 blockC10 += *(biasPtr + 1);
402 if (( OUTPUT_FEATURE_NUM % TILE_N) > 16 )
403 blockC20 += *(biasPtr + 2);
406 blockC00 = ACTIVATION(blockC00, NL_M, NL_N);
407 blockC10 = ACTIVATION(blockC10, NL_M, NL_N);
409 for (unsigned i = 0; i < 8; i++)
411 out0[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i];
412 out0[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC10[i];
415 for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++)
417 out0[(16+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION(blockC20[i], NL_M, NL_N);
421 else if ( ( OUTPUT_FEATURE_NUM % TILE_N ) >= 8 )
424 blockC00 += *biasPtr;
425 if (( OUTPUT_FEATURE_NUM % TILE_N) > 8 )
426 blockC10 += *(biasPtr + 1);
429 blockC00 = ACTIVATION(blockC00, NL_M, NL_N);
431 for (unsigned i = 0; i < 8; i++)
433 out0[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i];
436 for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++)
438 out0[(8+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION(blockC10[i], NL_M, NL_N);
444 blockC00 += *biasPtr;
446 for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++)
448 out0[( 0+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION(blockC00[i], NL_M, NL_N);
455 if ((global_y * TILE_M + 1) < OUTPUT_SIZE_X * OUTPUT_SIZE_Y )
457 if ( ( OUTPUT_FEATURE_NUM % TILE_N ) == 0 )
460 blockC01 += *biasPtr;
461 blockC11 += *(biasPtr + 1);
462 blockC21 += *(biasPtr + 2);
463 blockC31 += *(biasPtr + 3);
466 blockC01 = ACTIVATION(blockC01, NL_M, NL_N);
467 blockC11 = ACTIVATION(blockC11, NL_M, NL_N);
468 blockC21 = ACTIVATION(blockC21, NL_M, NL_N);
469 blockC31 = ACTIVATION(blockC31, NL_M, NL_N);
472 uint src3_offset = FUNC_CALL(calculate_eltw_input_offset_based_on_output_offset)(out1_offset, ELTW_STRIDE_X,ELTW_STRIDE_Y);
473 for(uint i = 0; i < 8; i++)
475 blockC01[i] += src3[src3_offset + (i + 0 )* INPUT1_FEATURE_PITCH];
476 blockC11[i] += src3[src3_offset + (i + 8 )* INPUT1_FEATURE_PITCH];
477 blockC21[i] += src3[src3_offset + (i + 16)* INPUT1_FEATURE_PITCH];
478 blockC31[i] += src3[src3_offset + (i + 24)* INPUT1_FEATURE_PITCH];
481 blockC01 = ACTIVATION_ELTW(blockC01, NL_M_ELTW, NL_N_ELTW);
482 blockC11 = ACTIVATION_ELTW(blockC11, NL_M_ELTW, NL_N_ELTW);
483 blockC21 = ACTIVATION_ELTW(blockC21, NL_M_ELTW, NL_N_ELTW);
484 blockC31 = ACTIVATION_ELTW(blockC31, NL_M_ELTW, NL_N_ELTW);
487 for( unsigned i = 0; i < 8; i++ )
489 out1[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC01[i];
490 out1[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC11[i];
491 out1[(16+i) * OUTPUT_FEATURE_PITCH] = blockC21[i];
492 out1[(24+i) * OUTPUT_FEATURE_PITCH] = blockC31[i];
497 if ( ( global_x + 1 ) < get_global_size(0) )
500 blockC01 += *biasPtr;
501 blockC11 += *(biasPtr + 1);
502 blockC21 += *(biasPtr + 2);
503 blockC31 += *(biasPtr + 3);
506 blockC01 = ACTIVATION(blockC01, NL_M, NL_N);
507 blockC11 = ACTIVATION(blockC11, NL_M, NL_N);
508 blockC21 = ACTIVATION(blockC21, NL_M, NL_N);
509 blockC31 = ACTIVATION(blockC31, NL_M, NL_N);
511 for ( unsigned i = 0; i < 8; i++ )
513 out1[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC01[i];
514 out1[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC11[i];
515 out1[(16+i) * OUTPUT_FEATURE_PITCH] = blockC21[i];
516 out1[(24+i) * OUTPUT_FEATURE_PITCH] = blockC31[i];
521 if ( ( OUTPUT_FEATURE_NUM % TILE_N ) >= 24 )
524 blockC01 += *biasPtr;
525 blockC11 += *(biasPtr + 1);
526 blockC21 += *(biasPtr + 2);
527 if ( ( OUTPUT_FEATURE_NUM % TILE_N ) > 24 ) blockC31 += *(biasPtr + 3);
530 blockC01 = ACTIVATION(blockC01, NL_M, NL_N);
531 blockC11 = ACTIVATION(blockC11, NL_M, NL_N);
532 blockC21 = ACTIVATION(blockC21, NL_M, NL_N);
534 for (unsigned i = 0; i < 8; i++)
536 out1[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC01[i];
537 out1[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC11[i];
538 out1[(16+i) * OUTPUT_FEATURE_PITCH] = blockC21[i];
541 // Remaining channels
542 for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++)
544 out1[(24+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION(blockC31[i], NL_M, NL_N);
547 else if ( ( OUTPUT_FEATURE_NUM % TILE_N ) >= 16 )
550 blockC01 += *biasPtr;
551 blockC11 += *(biasPtr + 1);
552 if ( ( OUTPUT_FEATURE_NUM % TILE_N ) > 16 ) blockC21 += *(biasPtr + 2);
555 blockC01 = ACTIVATION(blockC01, NL_M, NL_N);
556 blockC11 = ACTIVATION(blockC11, NL_M, NL_N);
558 for (unsigned i = 0; i < 8; i++)
560 out1[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC01[i];
561 out1[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC11[i];
564 for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++)
566 out1[(16+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION(blockC21[i], NL_M, NL_N);
569 else if ( ( OUTPUT_FEATURE_NUM % TILE_N ) >= 8 )
572 blockC01 += *biasPtr;
573 if ( ( OUTPUT_FEATURE_NUM % TILE_N ) > 8 ) blockC11 += *(biasPtr + 1);
576 blockC01 = ACTIVATION(blockC01, NL_M, NL_N);
578 for (unsigned i = 0; i < 8; i++)
580 out1[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC01[i];
583 for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++)
585 out1[(8+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION(blockC11[i], NL_M, NL_N);
591 blockC01 += *biasPtr;
594 for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++)
596 out1[( 0+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION(blockC01[i], NL_M, NL_N);