2 // Copyright (c) 2016 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
17 #include "include/include_all.cl"
19 #if defined(cl_intel_subgroups_short)
21 #define TILE_K FILTER_SIZE_X
// NOTE(review): this file is a lossy, line-numbered dump of an OpenCL fp16
// convolution kernel: the leading integer on each line is the original file's
// own line number, and gaps in that numbering (e.g. 64 -> 67, 71 -> 76,
// 122 -> 126) show that source lines are missing -- braces, do/while loop
// headers, else-branches, the remaining kernel parameters (dst, split_idx),
// and the trailing #endif's. The comments below document intent only; recover
// the complete original file before attempting any logic change.
//
// Kernel contract (as far as visible here):
//   * compiled only when cl_intel_subgroups_short is available; required
//     sub-group size is 16 work-items
//   * src0 = input activations (half), src1 = partially interleaved filter
//     weights (half), bias = per-output-channel bias (half); further
//     parameters (dst, split_idx, ...) are referenced below but their
//     declarations are absent from this dump
//   * each work-item accumulates two half16 tiles (blockC00 / blockC10),
//     i.e. 2 x 16 output feature channels, then adds bias, applies
//     ACTIVATION, and scatters results along OUTPUT_FEATURE_PITCH
24 __attribute__((intel_reqd_sub_group_size(16)))
25 KERNEL(convolution_f16)(
26 const __global half *src0,
28 const __global half *src1,
30 const __global half *bias,
34 #include "include/vec_typedefs.cl"
// Work-group / work-item coordinates: x walks output-channel tiles, y walks
// flattened spatial output positions, z walks the batch.
36 const unsigned group_x = get_group_id(0);
37 const unsigned group_y = get_group_id(1);
38 const unsigned global_x = get_global_id(0);
39 const unsigned global_y = get_global_id(1);
40 const unsigned global_z = get_global_id(2);
42 unsigned interleaved_y;
46 // Result ctile (*dst) is M rows x N columns
47 // LWG size is 1x16. Thus each thread calculates 16*M rows x N cols of ctile.
// Two zero-initialised accumulator tiles of 16 output channels each.
48 half16 blockC00 = 0.f;
49 half16 blockC10 = 0.f;
// Offset (in elements) of this split's slice of the input tensor.
51 const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * INPUT0_FEATURE_NUM;
52 // Src0 (patch input) is directly used as atile.
53 // Each work item points to the start of a different patch.
54 // atile is M rows x K columns.
55 #if defined(INPUT_BUFFER_WIDTH_PADDED) && defined(INPUT_BUFFER_HEIGHT_PADDED)
// Fully padded input: a single constant base offset suffices; no per-row
// boundary handling is needed later.
56 const uint src0_read_offset_const = INPUT0_OFFSET_WITH_PADDING + in_split_offset
57 + INPUT0_BATCH_PITCH * global_z // batch offset
58 + ( ( global_y / OUTPUT_SIZE_X ) * STRIDE_SIZE_Y * INPUT0_Y_PITCH ) // y offset
59 + ( ( global_y % OUTPUT_SIZE_X ) * STRIDE_SIZE_X ); // x offset
60 #elif !defined(INPUT_BUFFER_WIDTH_PADDED) && !defined(INPUT_BUFFER_HEIGHT_PADDED)
// NOTE(review): this unpadded path is deliberately disabled via "#pragma
// error" -- it is unfinished. Several of its lines (the else-branches around
// the partial_left / partial_right computation) are missing from this dump.
61 #pragma error - fix this path
62 const int y_offset = ( global_y / OUTPUT_SIZE_X ) * STRIDE_SIZE_Y - PADDING_SIZE_Y;
63 const int x_offset = ( global_y % OUTPUT_SIZE_X ) * STRIDE_SIZE_X - PADDING_SIZE_X;
64 uint src0_read_offset = INPUT_OFFSET + in_split_offset + INPUT0_BATCH_PITCH * global_z
65 + y_offset * INPUT0_Y_PITCH;
// partial_left / partial_right count the filter columns that fall outside the
// (unpadded) input row and must be zero-filled after each atile load.
67 int partial_left = 0, partial_right = 0;
70 partial_left = min((int) FILTER_SIZE_X, (int) abs(x_offset));
71 src0_read_offset -= partial_left;
76 src0_read_offset += x_offset;
78 if ((x_offset + FILTER_SIZE_X) >= INPUT_SIZE_X)
79 partial_right = min(FILTER_SIZE_X, INPUT_SIZE_X - x_offset);
81 partial_right = FILTER_SIZE_X;
83 #elif defined(INPUT_BUFFER_WIDTH_PADDED)
// NOTE(review): the width-padded-only path is likewise disabled and
// incomplete (its x offset handling is marked TODO below).
84 #pragma error - fix this path
85 // TODO: Handle offset
86 const int y_offset = ( global_y / OUTPUT_SIZE_X ) * STRIDE_SIZE_Y -PADDING_SIZE_Y;
87 int src0_read_offset = in_split_offset + INPUT0_BATCH_PITCH * global_z // batch offset
88 + y_offset * INPUT0_Y_PITCH // y offset
89 + ( ( global_y % OUTPUT_SIZE_X ) * STRIDE_SIZE_X ); // x offset
92 // Src1 (filter) is directly used as btile.
93 // It starts at the top of src1 and walks down.
94 // btile is K rows x N columns.
95 uint src0_read_offset = src0_read_offset_const;
96 uint src1_read_offset = ( global_x * TILE_N * 2);
// Scalar-times-row multiply-accumulate: multiplies the scalar _rowA by 16
// filter values held one per sub-group lane in colB (fetched with
// sub_group_broadcast) and accumulates into the 16 components of _result.
// NOTE(review): the macro body's enclosing { ... } lines (orig. 99/116) are
// missing from this dump, so as written the continuation swallows the
// following typedef line.
98 #define DOT_PRODUCT_16( _result, _rowA, colB ) \
100 _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \
101 _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \
102 _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \
103 _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \
104 _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \
105 _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \
106 _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \
107 _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \
108 _result.s8 = mad( _rowA, sub_group_broadcast( colB, 8 ), _result.s8 ); \
109 _result.s9 = mad( _rowA, sub_group_broadcast( colB, 9 ), _result.s9 ); \
110 _result.sa = mad( _rowA, sub_group_broadcast( colB, 10 ), _result.sa ); \
111 _result.sb = mad( _rowA, sub_group_broadcast( colB, 11 ), _result.sb ); \
112 _result.sc = mad( _rowA, sub_group_broadcast( colB, 12 ), _result.sc ); \
113 _result.sd = mad( _rowA, sub_group_broadcast( colB, 13 ), _result.sd ); \
114 _result.se = mad( _rowA, sub_group_broadcast( colB, 14 ), _result.se ); \
115 _result.sf = mad( _rowA, sub_group_broadcast( colB, 15 ), _result.sf ); \
117 typedef CAT( half, FILTER_SIZE_X ) half_t;
118 // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
119 // Inner loop loads and FMADs one row (FILTER_SIZE_X) of each input patch
120 // and FILTER_SIZE_X/2 rows of interleaved filter.
// Outer loop: input feature slices (patch_depth); inner loop: filter rows
// (patch_row). NOTE(review): the "do {" headers and the patch_row declaration
// are among the lines missing from this dump -- the matching while()'s appear
// near the bottom (orig. 240 / 244).
121 unsigned patch_depth = 0;
122 __attribute__((opencl_unroll_hint(1)))
126 __attribute__((opencl_unroll_hint(1)))
129 // Load atile and btile.
130 // Kernel data is partially interleaved. Every 2 rows are interleaved at half16 granularity.
131 // The exception is that if FILTER_SIZE_X is odd the last row is not interleaved. The non
132 // interleaved row is padded with zero to ensure same size as interleaved rows. This
133 // interleaving is done to ensure 0% GDR bank conflicts. For example, this is how the
134 // kernel data would be arranged before/after interleaving for FILTER_SIZE_X=3.
135 // (0, 0) (16, 0) (32, 0) (48, 0) ... (0, 0) ( 0, 1) (16, 0) ( 0, 1) (32, 0) (0, 1) (48, 0) ...
136 // (0, 1) (16, 1) (32, 1) (48, 1) ... => (0, 2) (16, 2) (32, 2) (48, 2) ...
137 // (0, 2) (16, 2) (32, 2) (48, 2) ... ...
139 const bool kernel_width_is_odd = FILTER_SIZE_X % 2 == 1;
140 #if defined(INPUT_BUFFER_WIDTH_PADDED) && defined(INPUT_BUFFER_HEIGHT_PADDED)
142 // in case the data is not aligned to sizeof(T)*FILTER_SIZE_X we need to use vload or set the data in a loop
143 half blockA00[FILTER_SIZE_X];
// Element-by-element atile load. The range check below guards reads near the
// end of the batch buffer; when the whole walk provably stays in bounds the
// unguarded load (orig. 156) is used instead.
146 LOOP(FILTER_SIZE_X, i,
149 if(src0_read_offset_const + (FILTER_SIZE_Y - 1) * INPUT0_Y_PITCH + (INPUT0_FEATURE_NUM - 1) * (INPUT0_FEATURE_PITCH - ( FILTER_SIZE_Y * INPUT0_Y_PITCH )) >= INPUT0_BATCH_NUM * INPUT0_BATCH_PITCH)
151 if(src0_read_offset + i < INPUT0_BATCH_NUM * INPUT0_BATCH_PITCH)
152 blockA00[i] = src0[src0_read_offset + i];
156 blockA00[i] = src0[src0_read_offset + i];
160 half* pblockA00 = (half*)(&blockA00);
162 #elif !defined(INPUT_BUFFER_WIDTH_PADDED) && !defined(INPUT_BUFFER_HEIGHT_PADDED)
163 // TODO: Fixed vload issue in this path.
166 half* pblockA00 = (half*)(&blockA00);
// NOTE(review): "INPPUT_PADDING_Y" below looks like a typo (presumably
// INPUT_PADDING_Y or PADDING_SIZE_Y); an undefined macro evaluates to 0 in
// #if, which would silently disable this specialised 3x3/pad-1 branch --
// confirm against the macro definitions before relying on it.
167 #if (PADDING_SIZE_X == 1) && (INPPUT_PADDING_Y == 1) && (FILTER_SIZE_X == 3) && (FILTER_SIZE_Y == 3)
// Rows fully above/below the input are zeroed (body missing from this dump);
// otherwise load the row and zero the single out-of-range edge column(s).
168 if ((y_offset + patch_row < 0) || ((y_offset + patch_row) >= INPUT_SIZE_Y))
174 blockA00 = src0[src0_read_offset - partial_left];
175 if (partial_left) pblockA00[0] = 0;
176 if (partial_right != FILTER_SIZE_X) pblockA00[FILTER_SIZE_X - 1] = 0;
// General case: zero-fill partial_left leading and the trailing columns past
// partial_right after the (shifted) row load.
179 if ((y_offset + patch_row < 0) || ((y_offset + patch_row) >= INPUT_SIZE_Y))
185 blockA00 = src0[src0_read_offset - partial_left];
186 for (unsigned i = 0; i < partial_left; ++i) pblockA00[i] = 0;
187 for (unsigned i = partial_right; i < FILTER_SIZE_X; ++i) pblockA00[i] = 0;
191 #elif defined(INPUT_BUFFER_WIDTH_PADDED)
192 // TODO: Fixed vload issue in this path.
// Width is padded, so only vertical out-of-range rows need handling here.
194 if ((y_offset + patch_row < 0) || ((y_offset + patch_row) >= INPUT_SIZE_Y))
200 blockA00 = src0[src0_read_offset];
// Advance atile pointer to the next input row for the next patch_row pass.
203 src0_read_offset += INPUT0_Y_PITCH;
// btile load: filter rows arrive pre-interleaved in pairs (see the layout
// comment above), so block-read 4 ushorts per row-pair and 2 ushorts for a
// trailing odd row; the aliasing pointers view the same staging buffer as
// ushort4 / ushort2 / half.
205 ushort blockB00[FILTER_SIZE_X * 2];
206 ushort4* p4BlockB00 = (ushort4*)blockB00;
207 ushort2* p2BlockB00 = (ushort2*)blockB00;
208 half* pBlockB00 = (half*)blockB00;
211 LOOP(FILTER_SIZE_X_DIV2, interleaved_y,
213 p4BlockB00[interleaved_y] = intel_sub_group_block_read_us4( (const __global ushort*)src1 + src1_read_offset );
214 src1_read_offset += ALIGNED_OFM * 2;
216 if ( kernel_width_is_odd )
218 p2BlockB00[FILTER_SIZE_X - 1] = intel_sub_group_block_read_us2( (const __global ushort*)src1 + src1_read_offset );
219 src1_read_offset += ALIGNED_OFM * 2;
// FMAD phase: each interleaved pair feeds two input pixels into both
// accumulator tiles; kernel_idx walks the staged filter values in order.
// NOTE(review): the kernel_idx declaration/reset is among the missing lines.
225 LOOP(FILTER_SIZE_X_DIV2, interleaved_y,
227 kernel_y = interleaved_y * 2;
228 DOT_PRODUCT_16( blockC00, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;
229 DOT_PRODUCT_16( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
230 DOT_PRODUCT_16( blockC10, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;
231 DOT_PRODUCT_16( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
// Odd filter width: one last non-interleaved pixel for each tile.
233 if ( kernel_width_is_odd )
235 kernel_y = interleaved_y * 2;
236 DOT_PRODUCT_16( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
237 DOT_PRODUCT_16( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
240 while( ++patch_row < FILTER_SIZE_Y );
242 src0_read_offset += INPUT0_FEATURE_PITCH - ( FILTER_SIZE_Y * INPUT0_Y_PITCH ); // reset to start of next slice of patch
244 while ( ++patch_depth < INPUT0_FEATURE_NUM );
246 #undef DOT_PRODUCT_16
// ---- Write-back phase: bias add, activation, scatter to dst ----
248 const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * OUTPUT_FEATURE_NUM;
249 // Dst resembles a cube of width x height x (output channel * batches). Each tile writes:
250 // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.
251 __global half *out = dst + OUTPUT_OFFSET + out_split_offset
252 + global_z * OUTPUT_BATCH_PITCH // batch offset
253 + ( group_x * TILE_N ) * OUTPUT_FEATURE_PITCH // channel offset
254 + ( ( global_y * TILE_M ) / OUTPUT_SIZE_X ) * OUTPUT_Y_PITCH // y offset
255 + ( ( global_y * TILE_M ) % OUTPUT_SIZE_X ); // x offset
// Skip work-items whose tile lies wholly past the spatial output extent
// (happens when OUTPUT_SIZE_X*OUTPUT_SIZE_Y is not a multiple of TILE_M).
258 if (global_y * TILE_M < OUTPUT_SIZE_X * OUTPUT_SIZE_Y )
261 __global half16* biasPtr = (__global half16*) (bias + group_x * TILE_N);
// Four write-back variants, chosen by how OUTPUT_FEATURE_NUM divides the
// 32-channel (2 x 16) tile. NOTE(review): the #else / closing lines between
// the variants are partly missing from this dump.
// Variant 1: channel count divisible by TILE_N -- both tiles written in full.
264 #if ( ( OUTPUT_FEATURE_NUM % TILE_N ) == 0 )
267 blockC00 += *biasPtr;
268 blockC10 += *(biasPtr + 1);
271 blockC00 = ACTIVATION(blockC00, NL_M, NL_N);
272 blockC10 = ACTIVATION(blockC10, NL_M, NL_N);
274 for (unsigned i = 0; i < 16; i++)
276 out[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i];
277 out[(16+i) * OUTPUT_FEATURE_PITCH] = blockC10[i];
// Variant 2: divisible by 16 -- full tiles for interior groups, a single
// 16-channel tile for the final group.
280 #elif ( ( OUTPUT_FEATURE_NUM % 16 ) == 0 )
281 if ( ( global_x + 1 ) < get_global_size(0) )
284 blockC00 += *biasPtr;
285 blockC10 += *(biasPtr + 1);
288 blockC00 = ACTIVATION(blockC00, NL_M, NL_N);
289 blockC10 = ACTIVATION(blockC10, NL_M, NL_N);
291 for ( unsigned i = 0; i < 16; i++ )
293 out[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i];
294 out[(16+i) * OUTPUT_FEATURE_PITCH] = blockC10[i];
300 blockC00 += *biasPtr;
303 blockC00 = ACTIVATION(blockC00, NL_M, NL_N);
305 for (unsigned i = 0; i < 16; i++)
307 out[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i];
// Variants 3/4: ragged final group -- interior groups write both tiles in
// full; the last group writes only OUTPUT_FEATURE_NUM % 16 channels of the
// partial tile.
311 if ( ( global_x + 1 ) < get_global_size(0) )
314 blockC00 += *biasPtr;
315 blockC10 += *(biasPtr + 1);
318 blockC00 = ACTIVATION(blockC00, NL_M, NL_N);
319 blockC10 = ACTIVATION(blockC10, NL_M, NL_N);
321 for ( unsigned i = 0; i < 16; i++ )
323 out[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i];
324 out[(16+i) * OUTPUT_FEATURE_PITCH] = blockC10[i];
329 #if ( (OUTPUT_FEATURE_NUM % TILE_N) > 16 )
332 blockC00 += *biasPtr;
333 blockC10 += *(biasPtr + 1);
336 blockC00 = ACTIVATION(blockC00, NL_M, NL_N);
337 blockC10 = ACTIVATION(blockC10, NL_M, NL_N);
339 for (unsigned i = 0; i < 16 ; i++)
341 out[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i];
343 for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 16 ; i++)
345 out[(16+i) * OUTPUT_FEATURE_PITCH] = blockC10[i];
349 blockC00 += *biasPtr;
352 blockC00 = ACTIVATION(blockC00, NL_M, NL_N);
354 for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 16 ; i++)
356 out[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i];