ee52ae87747a745550aa5e4c62993adf7847cbbf
[platform/upstream/dldt.git] / inference-engine / thirdparty / clDNN / kernel_selector / core / cl_kernels / convolution_gpu_b_fs_yx_fsv16_imad_1x1.cl
1 // Copyright (c) 2018-2019 Intel Corporation
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15
16 #include "include/common.cl"
17 #include "include/fetch.cl"
18 #include "include/imad.cl"
19 #include "include/mmad.cl"
20 #include "include/data_types.cl"
21
// FSV:  number of features grouped into one block; used below to count input
//       feature blocks (CEIL_DIV(INPUT0_FEATURE_NUM, FSV)) and to step pitches.
// SIMD: sub-group size required by the kernel attribute below.
22 #define FSV  16
23 #define SIMD 16
24
// Weights addressing, selected by the filter layout macro supplied at build time.
//   GET_WEIGHTS_INDEX(o, i, z, y, x) - index of a weight element; the kernel
//       only ever calls it with z = y = x = 0.
//   WEIGHTS_FEATURE_BLOCK_PITCH      - offset added per output feature block
//       (ofb) when the kernel loads weights.
//   WEIGHTS_IS_PITCH                 - offset added to the weights index after
//       each processed block of FSV input features.
// NOTE(review): there is no #else branch, so with any other filter layout these
// macros stay undefined - presumably the host only selects this kernel with one
// of the three layouts below; confirm.
25 #if FILTER_LAYOUT_OS_IS_YX_OSV16_ISV16
// The z argument is accepted but ignored for this 2D layout.
// ALIGN is defined further down; that is fine because macros expand at use.
26 #   define GET_WEIGHTS_INDEX(o, i, z, y, x)     GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(FILTER, o, i, y, x)
27 #   define WEIGHTS_FEATURE_BLOCK_PITCH          (ALIGN(FILTER_IFM_NUM, FSV) * FILTER_SIZE_X * FILTER_SIZE_Y * FSV)
28 #   define WEIGHTS_IS_PITCH                     (FSV * FSV * FILTER_SIZE_X * FILTER_SIZE_Y)
29
30 #elif FILTER_LAYOUT_OS_IS_ZYX_OSV32_ISV16
31 #   define GET_WEIGHTS_INDEX(o, i, z, y, x)     GET_FILTER_OS_IS_ZYX_OSV32_ISV16_INDEX(FILTER, o, i, z, y, x)
32 #   define WEIGHTS_FEATURE_BLOCK_PITCH          (FSV * FSV)
33 #   define WEIGHTS_IS_PITCH                     (2 * FSV * FSV * FILTER_SIZE_X * FILTER_SIZE_Y * FILTER_SIZE_Z)
34
35 #elif FILTER_LAYOUT_OS_IS_ZYX_OSV64_ISV16
36 #   define GET_WEIGHTS_INDEX(o, i, z, y, x)     GET_FILTER_OS_IS_ZYX_OSV64_ISV16_INDEX(FILTER, o, i, z, y, x)
37 #   define WEIGHTS_FEATURE_BLOCK_PITCH          (FSV * FSV)
38 #   define WEIGHTS_IS_PITCH                     (4 * FSV * FSV * FILTER_SIZE_X * FILTER_SIZE_Y * FILTER_SIZE_Z)
39
40 #endif
41
// Reinterpret a value as a 4-element vector of the input / filter element type.
// The two-level AS_TYPE_N / AS_TYPE_N_ indirection lets INPUT0_TYPE and
// FILTER_TYPE be macro-expanded before they are pasted into as_<type><n>().
42 #define AS_TYPE_N_(type, n, x) as_##type##n(x)
43 #define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
44 #define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
45 #define AS_FILTER_TYPE_4(x) AS_TYPE_N(FILTER_TYPE, 4, x)
46
// CEIL_DIV(a, b): a / b rounded up (for non-negative a and positive b).
// ALIGN(a, b):    a rounded up to the next multiple of b.
47 #define CEIL_DIV(a, b) (((a) + (b) - 1)/(b))
48 #define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
49
// Convolution kernel that reads only filter position (z, y, x) = (0, 0, 0),
// i.e. a 1x1 filter as the kernel name indicates, accumulating with IMAD.
//
// Work distribution (from the group ids used below):
//   group id 0 -> block of OUT_BLOCK_SPATIAL consecutive flattened (y * X + x)
//                 output positions,
//   group id 1 -> block of OUT_BLOCK_FEATURES * SIMD output features,
//   group id 2 -> batch index.
// Each work-group holds FEATURE_SLM_SPLIT sub-groups of SIMD work-items. Every
// lane accumulates OUT_BLOCK_FEATURES x OUT_BLOCK_SPATIAL partial results, one
// output feature per lane per feature block.
// When FEATURE_SLM_SPLIT != 1 the input feature blocks are divided between the
// sub-groups, the partial sums are reduced through local memory, and each of
// the first OUT_BLOCK_FEATURES sub-groups then finalizes one feature block.
//
// Parameters:
//   conv_input - input tensor, indexed through INPUT0_GET_INDEX.
//   output     - output tensor, indexed through OUTPUT_GET_INDEX.
//   weights    - filter data, indexed through GET_WEIGHTS_INDEX.
//   biases     - per-output-feature bias (only when BIAS_TERM).
//   FUSED_OPS_DECLS - extra arguments for fused operations (host generated).
//   split_idx  - not referenced directly in this body.
50 __attribute__((intel_reqd_sub_group_size(SIMD)))
51 __attribute__((reqd_work_group_size(1, SIMD * FEATURE_SLM_SPLIT, 1)))
52 KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
53     const __global INPUT0_TYPE   *conv_input,
54     __global OUTPUT_TYPE         *output,
55     const __global FILTER_TYPE    *weights,
56 #if BIAS_TERM
57     const __global BIAS_TYPE     *biases,
58 #endif
59 #if HAS_FUSED_OPS_DECLS
60     FUSED_OPS_DECLS,
61 #endif
62     uint split_idx)
63 {
64     // Use group ids to ease sub-group uniform variables optimization for compiler
       // out_yx_sg: first flattened spatial position of this block.
       // out_fg:    first output feature of this block (sub-group uniform).
       // out_f:     output feature handled by this lane in feature block 0.
65     const uint out_yx_sg = (uint)get_group_id(0) * OUT_BLOCK_SPATIAL;
66     uint out_fg = (uint)get_group_id(1) * OUT_BLOCK_FEATURES * SIMD;
67     const uint out_b = (uint)get_group_id(2);
68     uint out_f = out_fg + get_sub_group_local_id();
69
70     const uint sglid = get_sub_group_local_id();
71
       // Per-lane output coordinates: lane `sglid`, slot `os` describes spatial
       // position out_yx_sg + sglid + os * SIMD. Other lanes fetch these values
       // later through intel_sub_group_shuffle.
72     uint out_x_shuffle[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
73     uint out_y_shuffle[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
74
75     const uint max_out_yx = OUTPUT_SIZE_X * OUTPUT_SIZE_Y;
       // One past the last valid flattened position covered by this block.
76     uint max_local_yx = min(max_out_yx, out_yx_sg + OUT_BLOCK_SPATIAL);
77     __attribute__((opencl_unroll_hint))
78     for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
79         uint out_yx_shuffle = out_yx_sg + sglid + os * SIMD;
           // If the spatial size is not a multiple of the block size, clamp to
           // the last valid position of this block so x/y stay inside the output.
           // NOTE(review): in the exact-multiple case no clamp is applied, so
           // lanes with sglid + os * SIMD >= OUT_BLOCK_SPATIAL may describe
           // positions past this block. Their values are never selected by the
           // shuffles below, but the input loads for them still execute -
           // confirm those reads stay inside the input buffer.
80         uint out_yx_clamp = max_out_yx % OUT_BLOCK_SPATIAL == 0
81                           ? out_yx_shuffle 
82                           : min(out_yx_shuffle, max_local_yx - 1);
83         out_x_shuffle[os] = out_yx_clamp % OUTPUT_SIZE_X;
84         out_y_shuffle[os] = out_yx_clamp / OUTPUT_SIZE_X;
85     }
86
       // Split of the input feature blocks between the sub-groups of the work-group.
87     const uint ifm_blocks = CEIL_DIV(INPUT0_FEATURE_NUM, FSV);
88     const uint ifm_blocks_per_sg = ifm_blocks / FEATURE_SLM_SPLIT;
89     const uint ifm_per_sg = ifm_blocks_per_sg * FSV;
90
       // feature_offset: first input feature processed by this sub-group.
       // feature_blocks: number of FSV-sized input feature blocks it processes.
91     uint feature_offset = 0;
92     uint feature_blocks = ifm_blocks_per_sg;
93 #if FEATURE_SLM_SPLIT != 1
94     feature_offset = get_sub_group_id() * ifm_per_sg;
95
       // Uneven split: the first (ifm_blocks % FEATURE_SLM_SPLIT) sub-groups take
       // one extra block each; every offset is shifted by the extra blocks taken
       // by the sub-groups in front of it.
96     if (ifm_blocks % FEATURE_SLM_SPLIT != 0) {
97         bool bigger_sg = get_sub_group_id() < ifm_blocks % FEATURE_SLM_SPLIT;
98         feature_blocks = bigger_sg ? ifm_blocks_per_sg + 1 : ifm_blocks_per_sg;
99         feature_offset += bigger_sg ? get_sub_group_id() * FSV : ifm_blocks % FEATURE_SLM_SPLIT * FSV;
100     }
101 #endif
102
        // Weights index for this lane's output feature at the first input feature
        // of this sub-group, filter position (0, 0, 0).
103     uint filter_idx = GET_WEIGHTS_INDEX(out_f, feature_offset, 0, 0, 0);
104
        // Per-lane input index for the spatial position(s) this lane loads.
        // NOTE(review): input_x / input_y are computed in unsigned arithmetic;
        // if PADDING_SIZE_* can exceed out * STRIDE_SIZE_* the subtraction wraps -
        // presumably padding is zero or handled by INPUT0_GET_INDEX; confirm.
105     uint input_idx[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
106     __attribute__((opencl_unroll_hint))
107     for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
108         uint input_x = out_x_shuffle[os] * STRIDE_SIZE_X - PADDING_SIZE_X;
109         uint input_y = out_y_shuffle[os] * STRIDE_SIZE_Y - PADDING_SIZE_Y;
110         input_idx[os] = INPUT0_GET_INDEX(out_b, feature_offset, input_y, input_x);
111     }
112
        // Accumulators: dotProd[ofb][os] belongs to output feature
        // out_f + ofb * SIMD at spatial position out_yx_sg + os.
113     ACCUMULATOR_TYPE dotProd[OUT_BLOCK_FEATURES][OUT_BLOCK_SPATIAL] = { };
114
        // Main loop: one iteration per block of FSV input features (not unrolled).
115     __attribute__((opencl_unroll_hint(1)))
116     for (uint k = 0; k < feature_blocks; ++k) {
            // Four uints of weights per output feature block for this lane.
117         uint4 weights_val[OUT_BLOCK_FEATURES] = { };
118         __attribute__((opencl_unroll_hint))
119         for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
120             weights_val[ofb] = vload4(0, (__global uint*)(weights + filter_idx + ofb * WEIGHTS_FEATURE_BLOCK_PITCH));
121         }
122
            // Four uints of input for the spatial position(s) owned by this lane.
123         uint4 input_val[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
124         __attribute__((opencl_unroll_hint))
125         for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
126             input_val[os] = vload4(0, (__global uint *)(conv_input + input_idx[os]));
127         }
128
            // Both variants below run the same three nested loops over
            // (ive, ofb, os); only the nesting order differs.
129 #if OUT_BLOCK_FEATURES > 1 && FEATURE_SLM_SPLIT != 1 && OUT_BLOCK_SPATIAL > 14
130         // For some cases compiler spills here due to loop order
131         // Use suboptimal order to avoid this at cost of instruction dispatch delays.
132         __attribute__((opencl_unroll_hint))
133         for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
134             __attribute__((opencl_unroll_hint))
135             for (uint ive = 0; ive < 4; ++ive) {
136                 __attribute__((opencl_unroll_hint))
137                 for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
138 #else
139         __attribute__((opencl_unroll_hint))
140         for (uint ive = 0; ive < 4; ++ive) {
141             __attribute__((opencl_unroll_hint))
142             for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
143                 __attribute__((opencl_unroll_hint))
144                 for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
145 #endif
                        // The input uint for spatial position `os` lives in lane
                        // (os % SIMD), slot (os / SIMD); it is broadcast with a
                        // sub-group shuffle and combined with this lane's weights.
                        // Each uint is reinterpreted as a 4-element vector of the
                        // input / filter type (which implies 8-bit element types).
                        // IMAD is defined in an included header - presumably a
                        // 4-element integer multiply-accumulate; confirm there.
146                         dotProd[ofb][os] = IMAD(dotProd[ofb][os],
147                                                 AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val[os / SIMD][ive], os % SIMD)),
148                                                 AS_FILTER_TYPE_4(weights_val[ofb][ive]));
149                 }
150             }
151         }
152
            // Advance weights and input indices to the next input feature block.
153         filter_idx += WEIGHTS_IS_PITCH;
154         __attribute__((opencl_unroll_hint))
155         for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
156             input_idx[os] += INPUT0_FEATURE_PITCH * FSV;
157         }
158     }
159
160 #if FEATURE_SLM_SPLIT != 1
161     // Additional local memory reduction for feature split mode
162 #   if FEATURE_SLM_SPLIT < OUT_BLOCK_FEATURES
163 #   error convolution_gpu_b_fs_yx_fsv16_imad_1x1.cl - OUT_BLOCK_FEATURES must be less or equal to FEATURE_SLM_SPLIT
164 #   endif
165
        // partial_acc is addressed as
        //   [FEATURE_SLM_SPLIT - 1][OUT_BLOCK_FEATURES][OUT_BLOCK_SPATIAL][SIMD].
        // For every output feature block there are FEATURE_SLM_SPLIT - 1 slots:
        // one from each sub-group except the one that finalizes that block
        // (sub-group `f` finalizes feature block `f` and keeps its own partial
        // sum in registers).
166     const uint partial_acc_size = (FEATURE_SLM_SPLIT - 1) * OUT_BLOCK_FEATURES * SIMD * OUT_BLOCK_SPATIAL;
167     __local ACCUMULATOR_TYPE partial_acc[partial_acc_size];
168
        // Base slot of this sub-group: sub-group id - 1, with sub-group 0 also
        // using slot 0.
169     uint sgid_start_idx = get_sub_group_id();
170     sgid_start_idx = sgid_start_idx == 0 ? 0 : sgid_start_idx - 1;
171     __local ACCUMULATOR_TYPE* partial_acc_ptr = partial_acc + sgid_start_idx * OUT_BLOCK_FEATURES * SIMD * OUT_BLOCK_SPATIAL + sglid;
172
173     if (get_sub_group_id() < OUT_BLOCK_FEATURES) {
            // Finalizing sub-groups. The loop over `wg` with an equality test
            // gives the unrolled inner loops a compile-time value of the
            // sub-group id.
174         __attribute__((opencl_unroll_hint))
175         for (uint wg = 0; wg < OUT_BLOCK_FEATURES; ++wg) {
176             if (get_sub_group_id() == wg) {
                    // Feature blocks before the own one go to the base slot.
177                 __attribute__((opencl_unroll_hint))
178                 for (uint ofb = 0; ofb < wg; ++ofb) {
179                     __attribute__((opencl_unroll_hint))
180                     for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
181                         const uint partial_acc_ptr_idx =
182                             ofb * OUT_BLOCK_SPATIAL * SIMD +
183                             os * SIMD;
184                         partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][os];
185                     }
186                 }
                    // Keep the own feature block in registers, moved to row 0.
187                 __attribute__((opencl_unroll_hint))
188                 for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
189                     dotProd[0][os] = dotProd[wg][os];
190                 }
                    // Feature blocks after the own one go one slot further,
                    // except for sub-group 0, whose base slot is already slot 0.
191                 __attribute__((opencl_unroll_hint))
192                 for (uint ofb = wg + 1; ofb < OUT_BLOCK_FEATURES; ++ofb) {
193                     __attribute__((opencl_unroll_hint))
194                     for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
195                         const uint partial_acc_ptr_idx =
196                             ((wg != 0) ? OUT_BLOCK_SPATIAL * OUT_BLOCK_FEATURES * SIMD : 0) +
197                             ofb * OUT_BLOCK_SPATIAL * SIMD +
198                             os * SIMD;
199                         partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][os];
200                     }
201                 }
202             }
203         }
204     } else {
            // Sub-groups that finalize nothing publish every partial sum.
205         __attribute__((opencl_unroll_hint))
206         for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
207             __attribute__((opencl_unroll_hint))
208             for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
209                 const uint partial_acc_ptr_idx =
210                     ofb * OUT_BLOCK_SPATIAL * SIMD +
211                     os * SIMD;
212                 partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][os];
213             }
214         }
215     }
216
        // Every work-item reaches this barrier; the early returns come after it.
217     barrier(CLK_LOCAL_MEM_FENCE);
218
219     if (get_sub_group_id() >= OUT_BLOCK_FEATURES)
220         return;
221
        // Add the FEATURE_SLM_SPLIT - 1 published partial sums for the feature
        // block owned by this sub-group into dotProd[0].
222     partial_acc_ptr = partial_acc + get_sub_group_id() * OUT_BLOCK_SPATIAL * SIMD + sglid;
223     __attribute__((opencl_unroll_hint))
224     for (uint wg = 0; wg < FEATURE_SLM_SPLIT - 1; ++wg) {
225         __attribute__((opencl_unroll_hint))
226         for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
227             const uint partial_acc_ptr_idx =
228                 wg * OUT_BLOCK_FEATURES * SIMD * OUT_BLOCK_SPATIAL +
229                 os * SIMD;
230             dotProd[0][os] += partial_acc_ptr[partial_acc_ptr_idx];
231         }
232     }
233 #endif
234
        // Number of feature blocks each sub-group finalizes: all of them without
        // the split, exactly one (held in dotProd[0]) with it.
235 #if FEATURE_SLM_SPLIT == 1
236 #   define FINAL_OUT_BLOCK_FEATURES (OUT_BLOCK_FEATURES)
237 #else
238 #   define FINAL_OUT_BLOCK_FEATURES 1
        // Re-target the feature indices to the block owned by this sub-group.
239     out_f += get_sub_group_id() * SIMD;
240     out_fg += get_sub_group_id() * SIMD;
241
        // Nothing to write when the owned feature block starts past the last
        // output feature.
242     if (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % OUT_BLOCK_FEATURES != 0 && out_fg >= OUTPUT_FEATURE_NUM)
243         return;
244 #endif
245
246 #if BIAS_TERM
247     // Preload bias
248     BIAS_TYPE bias_val[FINAL_OUT_BLOCK_FEATURES];
249     for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
250         bias_val[ofb] = biases[out_f + ofb * SIMD];
251     }
252 #endif
253
254     // Convert accumulator type to activation type
255     ACTIVATION_TYPE dequantized[FINAL_OUT_BLOCK_FEATURES][OUT_BLOCK_SPATIAL];
256     __attribute__((opencl_unroll_hint))
257     for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
258         __attribute__((opencl_unroll_hint))
259         for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
260             dequantized[ofb][os] = TO_ACTIVATION_TYPE(dotProd[ofb][os]);
261
262 #if BIAS_TERM
263             dequantized[ofb][os] += TO_ACTIVATION_TYPE(bias_val[ofb]);
264 #endif
265         }
266     }
267
268     // Fused ops/activation
        // The FUSED_OPS_* macros are generated by the host; which local names
        // they read (presumably dequantized, ofb, os, ...) is not visible here.
269     OUTPUT_TYPE result[FINAL_OUT_BLOCK_FEATURES][OUT_BLOCK_SPATIAL];
270     __attribute__((opencl_unroll_hint))
271     for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
272 #if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD_SCALAR
273         FUSED_OPS_PRELOAD_SCALAR;
274 #endif
275         __attribute__((opencl_unroll_hint))
276         for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
277 #if HAS_FUSED_OPS
278     #if FUSED_OPS_CAN_USE_PRELOAD_SCALAR
279             FUSED_OPS_CALC_SCALAR;
280     #else
281             FUSED_OPS_SCALAR;
282     #endif
283             result[ofb][os] = FUSED_OPS_RESULT_SCALAR;
284 #else
285             result[ofb][os] = TO_OUTPUT_TYPE(ACTIVATION(dequantized[ofb][os], ACTIVATION_PARAMS));
286 #endif
287         }
288     }
289
290     // Store output
291     // Check if can use block writes
        // can_write_x:       the whole spatial block lies inside one output row.
        // can_write_spatial: the output has no x padding and the whole block
        //                    lies inside the output.
        // NOTE(review): `max_out_yx - OUT_BLOCK_SPATIAL` is unsigned and wraps if
        // the total spatial size is smaller than the block - presumably the host
        // never selects such a block size; confirm.
292     bool only_x_block = OUTPUT_SIZE_X % OUT_BLOCK_SPATIAL == 0;
293     bool at_least_one_x_block = OUTPUT_SIZE_X >= OUT_BLOCK_SPATIAL;
294     bool full_x = out_yx_sg % OUTPUT_SIZE_X <= OUTPUT_SIZE_X - OUT_BLOCK_SPATIAL;
295     bool can_write_x = only_x_block || (at_least_one_x_block && full_x);
296
297     bool no_x_pad = OUTPUT_PAD_BEFORE_SIZE_X == 0 && OUTPUT_PAD_AFTER_SIZE_X == 0;
298     bool exact_spatial = max_out_yx % OUT_BLOCK_SPATIAL == 0;
299     bool full_spatial = out_yx_sg <= max_out_yx - OUT_BLOCK_SPATIAL;
300     bool can_write_spatial = no_x_pad && (exact_spatial || full_spatial);
301
        // True when every lane of every finalized feature block maps to a real
        // output feature.
302     bool full_feature_block = (OUTPUT_FEATURE_NUM % SIMD == 0) || (out_fg + FINAL_OUT_BLOCK_FEATURES * SIMD <= OUTPUT_FEATURE_NUM);
303
304     bool can_use_full_block_write =  full_feature_block && (can_write_x || can_write_spatial);
305     if (can_use_full_block_write) {
            // Fast path: sub-group block writes, starting at the coordinates held
            // by lane 0 (the first spatial position of the block).
306         uint output_idx = OUTPUT_GET_INDEX(out_b,
307                                            out_fg,
308                                            intel_sub_group_shuffle(out_y_shuffle[0], 0),
309                                            intel_sub_group_shuffle(out_x_shuffle[0], 0));
310         __attribute__((opencl_unroll_hint))
311         for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
                // Skip feature blocks that lie entirely past the last output feature.
312             bool good_of_block = (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES == 0)
313                                || (out_fg + FINAL_OUT_BLOCK_FEATURES * SIMD <= OUTPUT_FEATURE_NUM)
314                                || (ofb < CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES);
315             if (good_of_block) {
                    // Emit the widest block writes first (8, then 4, then 2
                    // positions, then a single one), limited by the output
                    // element size; `os` carries over between the loops.
316                 uint os = 0;
317 #if OUTPUT_TYPE_SIZE == 1
318                 for (; os + 8 <= OUT_BLOCK_SPATIAL; os += 8) {
319                     MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8) result_val;
320                     __attribute__((opencl_unroll_hint))
321                     for (uint i = 0; i < 8; ++i) {
322                         result_val[i] = result[ofb][os + i];
323                     }
324                     DT_OUTPUT_BLOCK_WRITE8(output, output_idx, result_val);
325                     output_idx += 8 * SIMD;
326                 }
327 #endif
328 #if OUTPUT_TYPE_SIZE <= 2
329                 for (; os + 4 <= OUT_BLOCK_SPATIAL; os += 4) {
330                     MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) result_val;
331                     __attribute__((opencl_unroll_hint))
332                     for (uint i = 0; i < 4; ++i) {
333                         result_val[i] = result[ofb][os + i];
334                     }
335                     DT_OUTPUT_BLOCK_WRITE4(output, output_idx, result_val);
336                     output_idx += 4 * SIMD;
337                 }
338 #endif
339                 for (; os + 2 <= OUT_BLOCK_SPATIAL; os += 2) {
340                     MAKE_VECTOR_TYPE(OUTPUT_TYPE, 2) result_val;
341                     __attribute__((opencl_unroll_hint))
342                     for (uint i = 0; i < 2; ++i) {
343                         result_val[i] = result[ofb][os + i];
344                     }
345                     DT_OUTPUT_BLOCK_WRITE2(output, output_idx, result_val);
346                     output_idx += 2 * SIMD;
347                 }
348                 if (OUT_BLOCK_SPATIAL % 2 == 1) {
349                     OUTPUT_TYPE result_val = result[ofb][os];
350                     DT_OUTPUT_BLOCK_WRITE(output, output_idx, result_val);
351                     output_idx += 1 * SIMD;
352                 }
353             }
                // Rewind the spatial advance and step to the next feature block.
                // This also runs when the block was skipped; skipped blocks are
                // always the trailing ones, so no later write uses the index.
354             output_idx += OUTPUT_FEATURE_PITCH * FSV - OUT_BLOCK_SPATIAL * SIMD;
355         }
356     } else {
            // Slow path: one element per lane per spatial position, with bounds
            // checks on both the spatial position and the feature.
357         uint output_idx_shuffle[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
358         __attribute__((opencl_unroll_hint))
359         for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
360             output_idx_shuffle[os] = OUTPUT_GET_INDEX(out_b, out_fg, out_y_shuffle[os], out_x_shuffle[os]);
361         }
362         __attribute__((opencl_unroll_hint))
363         for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
364             bool good_of_block = (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES == 0)
365                                || (out_fg + FINAL_OUT_BLOCK_FEATURES * SIMD <= OUTPUT_FEATURE_NUM)
366                                || (ofb < CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES);
367             if (good_of_block) {
368                 __attribute__((opencl_unroll_hint))
369                 for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
                        // Stop at the first spatial position past the end of the
                        // output (only possible in the last, partial block).
370                     bool good_os = (max_out_yx % OUT_BLOCK_SPATIAL == 0) || (out_yx_sg <= max_out_yx - OUT_BLOCK_SPATIAL) || (os < max_out_yx % OUT_BLOCK_SPATIAL);
371                     if (!good_os)
372                         break;
373
                        // Fetch the base index for position `os` from the lane
                        // that computed it.
374                     uint output_idx = intel_sub_group_shuffle(output_idx_shuffle[os / SIMD], os % SIMD);
375                     bool good_of = (OUTPUT_FEATURE_NUM % SIMD == 0) || (out_f + ofb * SIMD < OUTPUT_FEATURE_NUM);
376
                        // Lanes past the last real output feature still store,
                        // but write zero.
377                     if (!good_of)
378                         result[ofb][os] = (OUTPUT_TYPE)0;
379
380                     output[output_idx + sglid] = result[ofb][os];
381                 }
382             }
383
                // Step every cached base index to the next feature block.
384             __attribute__((opencl_unroll_hint))
385             for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
386                 output_idx_shuffle[os] += OUTPUT_FEATURE_PITCH * FSV;
387             }
388         }
389     }
390
391 #undef FINAL_OUT_BLOCK_FEATURES
392 }
393
// Remove the helper macros defined at the top of this file so they do not leak
// into whatever source is compiled after it.
// NOTE(review): GET_WEIGHTS_INDEX, WEIGHTS_FEATURE_BLOCK_PITCH and
// WEIGHTS_IS_PITCH are defined above but not undefined here - confirm whether
// that is intentional.
394 #undef AS_INPUT0_TYPE_4
395 #undef AS_FILTER_TYPE_4
396 #undef AS_TYPE_N
397 #undef AS_TYPE_N_
398
399 #undef CEIL_DIV
400 #undef ALIGN
401
402 #undef FSV
403 #undef SIMD