[IE CLDNN] Add asymmetric quantization support to fsv16 imad 1x1 convolution kernel...
[platform/upstream/dldt.git] inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_b_fs_yx_fsv16_imad_1x1.cl
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


#include "include/common.cl"
#include "include/fetch.cl"
#include "include/imad.cl"
#include "include/mmad.cl"
#include "include/data_types.cl"

#define TYPE_N_(type, n) type##n
#define TYPE_N(type, n) TYPE_N_(type, n)
#define AS_TYPE_N_(type, n, x) as_##type##n(x)
#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
#define INPUT0_TYPE_4 TYPE_N(INPUT0_TYPE, 4)
#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)

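// Preprocessor flags controlling in-kernel handling of asymmetric quantization:
//  - SHOULD_USE_DATA_ZP: activation zero-points must be read inside the kernel, either
//    because no precomputed COMPENSATION_TERM is available or because padding before the
//    input requires substituting the zero-point for out-of-bounds input values.
//  - SHOULD_USE_DATA_AND_WEIGHTS_ZP: both activation and weight zero-points are present,
//    so the (activation_zp x weights_zp) cross term is also accumulated in-kernel.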
#if INPUT0_PAD_BEFORE_SIZE_X != 0 || INPUT0_PAD_BEFORE_SIZE_Y != 0
    #define NON_ZERO_INPUT0_PAD_BEFORE
#endif

#if !defined COMPENSATION_TERM || \
    (defined COMPENSATION_TERM && defined NON_ZERO_INPUT0_PAD_BEFORE)
    #define SHOULD_BALANCE_COMPENSATION
#endif

#if defined ASYMMETRIC_DATA_QUANTIZATION && defined SHOULD_BALANCE_COMPENSATION
    #define SHOULD_USE_DATA_ZP
#endif

#if defined ASYMMETRIC_DATA_QUANTIZATION && \
    defined ASYMMETRIC_WEIGHTS_QUANTIZATION && \
    defined SHOULD_BALANCE_COMPENSATION
    #define SHOULD_USE_DATA_AND_WEIGHTS_ZP
#endif

#ifdef SHOULD_USE_DATA_AND_WEIGHTS_ZP
    #define ACCUMULATOR_TYPE_4 TYPE_N(ACCUMULATOR_TYPE, 4)
#endif

#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
    #define FILTER_TYPE_16 TYPE_N(FILTER_TYPE, 16)
#endif

#define AS_FILTER_TYPE_4(x) AS_TYPE_N(FILTER_TYPE, 4, x)

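// Weight layout abstraction: GET_WEIGHTS_INDEX resolves the filter offset for the selected
// blocked layout, WEIGHTS_FEATURE_BLOCK_PITCH is the pitch between consecutive blocks of
// SIMD output features, and WEIGHTS_IS_PITCH is the pitch between consecutive blocks of
// FSV input features.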
#if FILTER_LAYOUT_OS_IS_YX_OSV16_ISV16
#   define GET_WEIGHTS_INDEX(o, i, z, y, x)     GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(FILTER, o, i, y, x)
#   define WEIGHTS_FEATURE_BLOCK_PITCH          (ALIGN(FILTER_IFM_NUM, FSV) * FILTER_SIZE_X * FILTER_SIZE_Y * FSV)
#   define WEIGHTS_IS_PITCH                     (FSV * FSV * FILTER_SIZE_X * FILTER_SIZE_Y)

#elif FILTER_LAYOUT_OS_IS_ZYX_OSV32_ISV16
#   define GET_WEIGHTS_INDEX(o, i, z, y, x)     GET_FILTER_OS_IS_ZYX_OSV32_ISV16_INDEX(FILTER, o, i, z, y, x)
#   define WEIGHTS_FEATURE_BLOCK_PITCH          (FSV * FSV)
#   define WEIGHTS_IS_PITCH                     (2 * FSV * FSV * FILTER_SIZE_X * FILTER_SIZE_Y * FILTER_SIZE_Z)

#elif FILTER_LAYOUT_OS_IS_ZYX_OSV64_ISV16
#   define GET_WEIGHTS_INDEX(o, i, z, y, x)     GET_FILTER_OS_IS_ZYX_OSV64_ISV16_INDEX(FILTER, o, i, z, y, x)
#   define WEIGHTS_FEATURE_BLOCK_PITCH          (FSV * FSV)
#   define WEIGHTS_IS_PITCH                     (4 * FSV * FSV * FILTER_SIZE_X * FILTER_SIZE_Y * FILTER_SIZE_Z)

#endif

#define CEIL_DIV(a, b) (((a) + (b) - 1)/(b))
#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))

#define FSV  16
#define SIMD 16

__attribute__((intel_reqd_sub_group_size(SIMD)))
__attribute__((reqd_work_group_size(1, SIMD * FEATURE_SLM_SPLIT, 1)))
KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
    const __global INPUT0_TYPE   *conv_input,
    __global OUTPUT_TYPE         *output,
    const __global FILTER_TYPE   *weights,
#if BIAS_TERM
    const __global BIAS_TYPE     *biases,
#endif
#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
    const __global WEIGHTS_ZERO_POINTS_TYPE *weights_zp,
#endif
#ifdef ASYMMETRIC_DATA_QUANTIZATION
    const __global ACTIVATIONS_ZERO_POINTS_TYPE *activations_zp,
#endif
#ifdef COMPENSATION_TERM
    const __global COMPENSATION_TYPE *compensation,
#endif
#if HAS_FUSED_OPS_DECLS
    FUSED_OPS_DECLS,
#endif
    uint split_idx)
{
    // Use group ids to help the compiler treat these variables as sub-group uniform
    const uint out_yx_sg = (uint)get_group_id(0) * OUT_BLOCK_SPATIAL;
    uint out_fg = (uint)get_group_id(1) * OUT_BLOCK_FEATURES * SIMD;
    const uint out_b = (uint)get_group_id(2);
    uint out_f = out_fg + get_sub_group_local_id();

    const uint sglid = get_sub_group_local_id();

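    // Output spatial coordinates for this sub-group's OUT_BLOCK_SPATIAL positions,
    // distributed across SIMD lanes; individual values are recovered later with
    // intel_sub_group_shuffle.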
    uint out_x_shuffle[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
    uint out_y_shuffle[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };

    const uint max_out_yx = OUTPUT_SIZE_X * OUTPUT_SIZE_Y;
    uint max_local_yx = min(max_out_yx, out_yx_sg + OUT_BLOCK_SPATIAL);
    __attribute__((opencl_unroll_hint))
    for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
        uint out_yx_shuffle = out_yx_sg + sglid + os * SIMD;
        uint out_yx_clamp = max_out_yx % OUT_BLOCK_SPATIAL == 0
                          ? out_yx_shuffle
                          : min(out_yx_shuffle, max_local_yx - 1);
        out_x_shuffle[os] = out_yx_clamp % OUTPUT_SIZE_X;
        out_y_shuffle[os] = out_yx_clamp / OUTPUT_SIZE_X;
    }

    const uint ifm_blocks = CEIL_DIV(INPUT0_FEATURE_NUM, FSV);
    const uint ifm_blocks_per_sg = ifm_blocks / FEATURE_SLM_SPLIT;
    const uint ifm_per_sg = ifm_blocks_per_sg * FSV;

    uint feature_offset = 0;
    uint feature_blocks = ifm_blocks_per_sg;
#if FEATURE_SLM_SPLIT != 1
    feature_offset = get_sub_group_id() * ifm_per_sg;

    if (ifm_blocks % FEATURE_SLM_SPLIT != 0) {
        bool bigger_sg = get_sub_group_id() < ifm_blocks % FEATURE_SLM_SPLIT;
        feature_blocks = bigger_sg ? ifm_blocks_per_sg + 1 : ifm_blocks_per_sg;
        feature_offset += bigger_sg ? get_sub_group_id() * FSV : ifm_blocks % FEATURE_SLM_SPLIT * FSV;
    }
#endif

    uint filter_idx = GET_WEIGHTS_INDEX(out_f, feature_offset, 0, 0, 0);

    uint input_idx[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
    #ifdef SHOULD_USE_DATA_ZP
        uint input_x[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
        uint input_y[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
    #endif

    __attribute__((opencl_unroll_hint))
    for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
        #ifdef SHOULD_USE_DATA_ZP
            input_x[os] = out_x_shuffle[os] * STRIDE_SIZE_X - PADDING_SIZE_X;
            input_y[os] = out_y_shuffle[os] * STRIDE_SIZE_Y - PADDING_SIZE_Y;
            input_idx[os] = INPUT0_GET_INDEX(out_b, feature_offset, input_y[os], input_x[os]);
        #else
            uint input_x = out_x_shuffle[os] * STRIDE_SIZE_X - PADDING_SIZE_X;
            uint input_y = out_y_shuffle[os] * STRIDE_SIZE_Y - PADDING_SIZE_Y;
            input_idx[os] = INPUT0_GET_INDEX(out_b, feature_offset, input_y, input_x);
        #endif
    }

    ACCUMULATOR_TYPE dotProd[OUT_BLOCK_FEATURES][OUT_BLOCK_SPATIAL] = { };

    #ifdef SHOULD_USE_DATA_ZP
        uint data_zp_idx = feature_offset;
        uint4 data_zp_val;
    #endif

    #ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
        uint4 weights_zp_val[OUT_BLOCK_FEATURES];
        __attribute__((opencl_unroll_hint))
        for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
            weights_zp_val[ofb] = as_uint4((FILTER_TYPE_16)weights_zp[out_f + ofb * FSV]);
        }
        #if INPUT0_FEATURE_NUM % FSV != 0
            uint4 weights_zp_vec_partial[OUT_BLOCK_FEATURES];
            __attribute__((opencl_unroll_hint))
            for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
                weights_zp_vec_partial[ofb] = weights_zp_val[ofb];
                FILTER_TYPE* wzp_p = (FILTER_TYPE*)&weights_zp_vec_partial[ofb];
                __attribute__((opencl_unroll_hint))
                for (uint f = INPUT0_FEATURE_NUM % FSV; f < FSV; f++) {
                    wzp_p[f] = 0;
                }
            }
        #endif
    #endif

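    // Main accumulation loop over blocks of FSV input features.
    // With asymmetric quantization the exact product is (a - a_zp) * (w - w_zp)
    //   = a*w - a_zp*w - a*w_zp + a_zp*w_zp,
    // so in addition to the plain a*w IMAD the kernel accumulates correction terms:
    //   dotProdAxWZP   (a * w_zp)    is always subtracted when weights are asymmetric;
    //   dotProdAZPxW   (a_zp * w) and dotProdAZPxWZP (a_zp * w_zp) are applied in-kernel
    //   only when no precomputed COMPENSATION_TERM folds them in.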
    __attribute__((opencl_unroll_hint(1)))
    for (uint k = 0; k < feature_blocks; ++k) {
        #ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
            #if INPUT0_FEATURE_NUM % FSV != 0
                if (feature_offset + (k + 1) * FSV >= ALIGN(INPUT0_FEATURE_NUM, FSV)) {
                    __attribute__((opencl_unroll_hint))
                    for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
                        weights_zp_val[ofb] = weights_zp_vec_partial[ofb];
                    }
                }
            #endif
        #endif

        #ifdef SHOULD_USE_DATA_ZP
            #if (INPUT0_FEATURE_NUM % FSV != 0)
                data_zp_val = as_uint4(vload16(0, activations_zp + data_zp_idx));
            #else
                data_zp_val = vload4(0, (__global uint *)(activations_zp + data_zp_idx));
            #endif
        #endif

        #ifdef SHOULD_USE_DATA_AND_WEIGHTS_ZP
            ACCUMULATOR_TYPE_4 dotProdAZPxWZP[OUT_BLOCK_FEATURES];
            __attribute__((opencl_unroll_hint))
            for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
                dotProdAZPxWZP[ofb] = 0;
                __attribute__((opencl_unroll_hint))
                for (uint ive = 0; ive < 4; ive++) {
                    dotProdAZPxWZP[ofb][ive] = TO_ACCUMULATOR_TYPE(
                    IMAD(dotProdAZPxWZP[ofb][ive],
                    AS_INPUT0_TYPE_4(data_zp_val[ive]),
                    AS_FILTER_TYPE_4(weights_zp_val[ofb][ive])));
                }
            }
        #endif

        uint4 weights_val[OUT_BLOCK_FEATURES] = { };
        __attribute__((opencl_unroll_hint))
        for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
            weights_val[ofb] = vload4(0, (__global uint*)(weights + filter_idx + ofb * WEIGHTS_FEATURE_BLOCK_PITCH));
        }

        uint4 input_val[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
        __attribute__((opencl_unroll_hint))
        for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
            #if defined ASYMMETRIC_DATA_QUANTIZATION && defined NON_ZERO_INPUT0_PAD_BEFORE
                if (((input_x[os] < 0) || (input_x[os] >= INPUT0_SIZE_X)) ||
                    ((input_y[os] < 0) || (input_y[os] >= INPUT0_SIZE_Y))) {
                    input_val[os] = data_zp_val;
                } else {
            #endif
                    input_val[os] = vload4(0, (__global uint *)(conv_input + input_idx[os]));
            #if defined ASYMMETRIC_DATA_QUANTIZATION && defined NON_ZERO_INPUT0_PAD_BEFORE
                }
            #endif
        }

#if OUT_BLOCK_FEATURES > 1 && FEATURE_SLM_SPLIT != 1 && OUT_BLOCK_SPATIAL > 14
        // In some cases the compiler spills registers here due to the loop order.
        // Use a suboptimal order to avoid this, at the cost of instruction dispatch delays.
        __attribute__((opencl_unroll_hint))
        for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
            __attribute__((opencl_unroll_hint))
            for (uint ive = 0; ive < 4; ++ive) {
                __attribute__((opencl_unroll_hint))
                for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
                    #ifdef SHOULD_USE_DATA_ZP
                        ACCUMULATOR_TYPE dotProdAZPxW = 0;
                        dotProdAZPxW = TO_ACCUMULATOR_TYPE(
                        IMAD(dotProdAZPxW,
                        AS_INPUT0_TYPE_4(data_zp_val[ive]),
                        AS_FILTER_TYPE_4(weights_val[ofb][ive])));
                    #endif
#else
        __attribute__((opencl_unroll_hint))
        for (uint ive = 0; ive < 4; ++ive) {
            __attribute__((opencl_unroll_hint))
            for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
                #ifdef SHOULD_USE_DATA_ZP
                    ACCUMULATOR_TYPE dotProdAZPxW = 0;
                    dotProdAZPxW = TO_ACCUMULATOR_TYPE(
                    IMAD(dotProdAZPxW,
                    AS_INPUT0_TYPE_4(data_zp_val[ive]),
                    AS_FILTER_TYPE_4(weights_val[ofb][ive])));
                #endif
                __attribute__((opencl_unroll_hint))
                for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
#endif
                        INPUT0_TYPE_4 inputs = AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val[os / SIMD][ive], os % SIMD));

                        dotProd[ofb][os] = IMAD(dotProd[ofb][os],
                                                inputs,
                                                AS_FILTER_TYPE_4(weights_val[ofb][ive]));

                        #ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
                            ACCUMULATOR_TYPE dotProdAxWZP = 0;
                            dotProdAxWZP = TO_ACCUMULATOR_TYPE(
                            IMAD(dotProdAxWZP,
                            inputs,
                            AS_FILTER_TYPE_4(weights_zp_val[ofb][ive])));
                            dotProd[ofb][os] -= dotProdAxWZP;
                        #endif

                        #if !defined COMPENSATION_TERM && defined ASYMMETRIC_DATA_QUANTIZATION
                            dotProd[ofb][os] -= dotProdAZPxW;
                        #endif

                        #if (!defined COMPENSATION_TERM && \
                                defined ASYMMETRIC_DATA_QUANTIZATION && \
                                defined ASYMMETRIC_WEIGHTS_QUANTIZATION)
                            dotProd[ofb][os] += dotProdAZPxWZP[ofb][ive];
                        #endif
                }
            }
        }

        filter_idx += WEIGHTS_IS_PITCH;
        __attribute__((opencl_unroll_hint))
        for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
            input_idx[os] += INPUT0_FEATURE_PITCH * FSV;
        }

        #ifdef SHOULD_USE_DATA_ZP
            data_zp_idx += FSV;
        #endif
    }

#if FEATURE_SLM_SPLIT != 1
    // Additional local memory reduction for feature split mode
#   if FEATURE_SLM_SPLIT < OUT_BLOCK_FEATURES
#   error convolution_gpu_b_fs_yx_fsv16_imad_1x1.cl - OUT_BLOCK_FEATURES must be less than or equal to FEATURE_SLM_SPLIT
#   endif
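    // Each sub-group stores the partial sums for the output feature blocks it does not
    // finalize into partial_acc; after the barrier, the first OUT_BLOCK_FEATURES sub-groups
    // each own one output feature block and add the remaining FEATURE_SLM_SPLIT - 1 partial
    // copies from local memory.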

    const uint partial_acc_size = (FEATURE_SLM_SPLIT - 1) * OUT_BLOCK_FEATURES * SIMD * OUT_BLOCK_SPATIAL;
    __local ACCUMULATOR_TYPE partial_acc[partial_acc_size];

    uint sgid_start_idx = get_sub_group_id();
    sgid_start_idx = sgid_start_idx == 0 ? 0 : sgid_start_idx - 1;
    __local ACCUMULATOR_TYPE* partial_acc_ptr = partial_acc + sgid_start_idx * OUT_BLOCK_FEATURES * SIMD * OUT_BLOCK_SPATIAL + sglid;

    if (get_sub_group_id() < OUT_BLOCK_FEATURES) {
        __attribute__((opencl_unroll_hint))
        for (uint wg = 0; wg < OUT_BLOCK_FEATURES; ++wg) {
            if (get_sub_group_id() == wg) {
                __attribute__((opencl_unroll_hint))
                for (uint ofb = 0; ofb < wg; ++ofb) {
                    __attribute__((opencl_unroll_hint))
                    for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
                        const uint partial_acc_ptr_idx =
                            ofb * OUT_BLOCK_SPATIAL * SIMD +
                            os * SIMD;
                        partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][os];
                    }
                }
                __attribute__((opencl_unroll_hint))
                for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
                    dotProd[0][os] = dotProd[wg][os];
                }
                __attribute__((opencl_unroll_hint))
                for (uint ofb = wg + 1; ofb < OUT_BLOCK_FEATURES; ++ofb) {
                    __attribute__((opencl_unroll_hint))
                    for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
                        const uint partial_acc_ptr_idx =
                            ((wg != 0) ? OUT_BLOCK_SPATIAL * OUT_BLOCK_FEATURES * SIMD : 0) +
                            ofb * OUT_BLOCK_SPATIAL * SIMD +
                            os * SIMD;
                        partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][os];
                    }
                }
            }
        }
    } else {
        __attribute__((opencl_unroll_hint))
        for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
            __attribute__((opencl_unroll_hint))
            for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
                const uint partial_acc_ptr_idx =
                    ofb * OUT_BLOCK_SPATIAL * SIMD +
                    os * SIMD;
                partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][os];
            }
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    if (get_sub_group_id() >= OUT_BLOCK_FEATURES)
        return;

    partial_acc_ptr = partial_acc + get_sub_group_id() * OUT_BLOCK_SPATIAL * SIMD + sglid;
    __attribute__((opencl_unroll_hint))
    for (uint wg = 0; wg < FEATURE_SLM_SPLIT - 1; ++wg) {
        __attribute__((opencl_unroll_hint))
        for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
            const uint partial_acc_ptr_idx =
                wg * OUT_BLOCK_FEATURES * SIMD * OUT_BLOCK_SPATIAL +
                os * SIMD;
            dotProd[0][os] += partial_acc_ptr[partial_acc_ptr_idx];
        }
    }
#endif

#if FEATURE_SLM_SPLIT == 1
#   define FINAL_OUT_BLOCK_FEATURES (OUT_BLOCK_FEATURES)
#else
#   define FINAL_OUT_BLOCK_FEATURES 1
    out_f += get_sub_group_id() * SIMD;
    out_fg += get_sub_group_id() * SIMD;

    if (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % OUT_BLOCK_FEATURES != 0 && out_fg >= OUTPUT_FEATURE_NUM)
        return;
#endif
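// From here on each remaining sub-group handles FINAL_OUT_BLOCK_FEATURES blocks of SIMD
// output features: all OUT_BLOCK_FEATURES blocks when there is no SLM split, or only the
// single block it reduced above when FEATURE_SLM_SPLIT != 1.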

#if BIAS_TERM
    // Preload bias
    BIAS_TYPE bias_val[FINAL_OUT_BLOCK_FEATURES];
    for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
        bias_val[ofb] = biases[out_f + ofb * SIMD];
    }
#endif

#ifdef COMPENSATION_TERM
    COMPENSATION_TYPE comp[FINAL_OUT_BLOCK_FEATURES];
    __attribute__((opencl_unroll_hint))
    for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
        comp[ofb] = compensation[out_f + ofb * SIMD];
    }
#endif

    // Convert accumulator type to activation type
    ACTIVATION_TYPE dequantized[FINAL_OUT_BLOCK_FEATURES][OUT_BLOCK_SPATIAL];
    __attribute__((opencl_unroll_hint))
    for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
        __attribute__((opencl_unroll_hint))
        for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
            dequantized[ofb][os] = TO_ACTIVATION_TYPE(dotProd[ofb][os]);

#if BIAS_TERM
            dequantized[ofb][os] += TO_ACTIVATION_TYPE(bias_val[ofb]);
#endif
#ifdef COMPENSATION_TERM
            dequantized[ofb][os] += TO_ACTIVATION_TYPE(comp[ofb]);
#endif
        }
    }

    // Fused ops/activation
    OUTPUT_TYPE result[FINAL_OUT_BLOCK_FEATURES][OUT_BLOCK_SPATIAL];
    __attribute__((opencl_unroll_hint))
    for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD_SCALAR
        FUSED_OPS_PRELOAD_SCALAR;
#endif
        __attribute__((opencl_unroll_hint))
        for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
#if HAS_FUSED_OPS
    #if FUSED_OPS_CAN_USE_PRELOAD_SCALAR
            FUSED_OPS_CALC_SCALAR;
    #else
            FUSED_OPS_SCALAR;
    #endif
            result[ofb][os] = FUSED_OPS_RESULT_SCALAR;
#else
            result[ofb][os] = TO_OUTPUT_TYPE(ACTIVATION(dequantized[ofb][os], ACTIVATION_PARAMS));
#endif
        }
    }

    // Store output
    // Check whether block writes can be used
    bool only_x_block = OUTPUT_SIZE_X % OUT_BLOCK_SPATIAL == 0;
    bool at_least_one_x_block = OUTPUT_SIZE_X >= OUT_BLOCK_SPATIAL;
    bool full_x = out_yx_sg % OUTPUT_SIZE_X <= OUTPUT_SIZE_X - OUT_BLOCK_SPATIAL;
    bool can_write_x = only_x_block || (at_least_one_x_block && full_x);

    bool no_x_pad = OUTPUT_PAD_BEFORE_SIZE_X == 0 && OUTPUT_PAD_AFTER_SIZE_X == 0;
    bool exact_spatial = max_out_yx % OUT_BLOCK_SPATIAL == 0;
    bool full_spatial = out_yx_sg <= max_out_yx - OUT_BLOCK_SPATIAL;
    bool can_write_spatial = no_x_pad && (exact_spatial || full_spatial);

    bool full_feature_block = (OUTPUT_FEATURE_NUM % SIMD == 0) || (out_fg + FINAL_OUT_BLOCK_FEATURES * SIMD <= OUTPUT_FEATURE_NUM);

    bool can_use_full_block_write = full_feature_block && (can_write_x || can_write_spatial);
    if (can_use_full_block_write) {
        uint output_idx = OUTPUT_GET_INDEX(out_b,
                                           out_fg,
                                           intel_sub_group_shuffle(out_y_shuffle[0], 0),
                                           intel_sub_group_shuffle(out_x_shuffle[0], 0));
        __attribute__((opencl_unroll_hint))
        for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
            bool good_of_block = (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES == 0)
                               || (out_fg + FINAL_OUT_BLOCK_FEATURES * SIMD <= OUTPUT_FEATURE_NUM)
                               || (ofb < CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES);
            if (good_of_block) {
                uint os = 0;
#if OUTPUT_TYPE_SIZE == 1
                for (; os + 8 <= OUT_BLOCK_SPATIAL; os += 8) {
                    MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8) result_val;
                    __attribute__((opencl_unroll_hint))
                    for (uint i = 0; i < 8; ++i) {
                        result_val[i] = result[ofb][os + i];
                    }
                    DT_OUTPUT_BLOCK_WRITE8(output, output_idx, result_val);
                    output_idx += 8 * SIMD;
                }
#endif
#if OUTPUT_TYPE_SIZE <= 2
                for (; os + 4 <= OUT_BLOCK_SPATIAL; os += 4) {
                    MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) result_val;
                    __attribute__((opencl_unroll_hint))
                    for (uint i = 0; i < 4; ++i) {
                        result_val[i] = result[ofb][os + i];
                    }
                    DT_OUTPUT_BLOCK_WRITE4(output, output_idx, result_val);
                    output_idx += 4 * SIMD;
                }
#endif
                for (; os + 2 <= OUT_BLOCK_SPATIAL; os += 2) {
                    MAKE_VECTOR_TYPE(OUTPUT_TYPE, 2) result_val;
                    __attribute__((opencl_unroll_hint))
                    for (uint i = 0; i < 2; ++i) {
                        result_val[i] = result[ofb][os + i];
                    }
                    DT_OUTPUT_BLOCK_WRITE2(output, output_idx, result_val);
                    output_idx += 2 * SIMD;
                }
                if (OUT_BLOCK_SPATIAL % 2 == 1) {
                    OUTPUT_TYPE result_val = result[ofb][os];
                    DT_OUTPUT_BLOCK_WRITE(output, output_idx, result_val);
                    output_idx += 1 * SIMD;
                }
            }
            output_idx += OUTPUT_FEATURE_PITCH * FSV - OUT_BLOCK_SPATIAL * SIMD;
        }
    } else {
        uint output_idx_shuffle[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
        __attribute__((opencl_unroll_hint))
        for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
            output_idx_shuffle[os] = OUTPUT_GET_INDEX(out_b, out_fg, out_y_shuffle[os], out_x_shuffle[os]);
        }
        __attribute__((opencl_unroll_hint))
        for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
            bool good_of_block = (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES == 0)
                               || (out_fg + FINAL_OUT_BLOCK_FEATURES * SIMD <= OUTPUT_FEATURE_NUM)
                               || (ofb < CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES);
            if (good_of_block) {
                __attribute__((opencl_unroll_hint))
                for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
                    bool good_os = (max_out_yx % OUT_BLOCK_SPATIAL == 0) || (out_yx_sg <= max_out_yx - OUT_BLOCK_SPATIAL) || (os < max_out_yx % OUT_BLOCK_SPATIAL);
                    if (!good_os)
                        break;

                    uint output_idx = intel_sub_group_shuffle(output_idx_shuffle[os / SIMD], os % SIMD);
                    bool good_of = (OUTPUT_FEATURE_NUM % SIMD == 0) || (out_f + ofb * SIMD < OUTPUT_FEATURE_NUM);

                    if (!good_of)
                        result[ofb][os] = (OUTPUT_TYPE)0;

                    output[output_idx + sglid] = result[ofb][os];
                }
            }

            __attribute__((opencl_unroll_hint))
            for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
                output_idx_shuffle[os] += OUTPUT_FEATURE_PITCH * FSV;
            }
        }
    }

#undef FINAL_OUT_BLOCK_FEATURES
}

#undef TYPE_N_
#undef TYPE_N
#undef AS_TYPE_N
#undef AS_TYPE_N_

#undef INPUT0_TYPE_4
#undef AS_INPUT0_TYPE_4

#ifdef NON_ZERO_INPUT0_PAD_BEFORE
    #undef NON_ZERO_INPUT0_PAD_BEFORE
#endif

#ifdef SHOULD_BALANCE_COMPENSATION
    #undef SHOULD_BALANCE_COMPENSATION
#endif

#ifdef SHOULD_USE_DATA_ZP
    #undef SHOULD_USE_DATA_ZP
#endif

#ifdef SHOULD_USE_DATA_AND_WEIGHTS_ZP
    #undef SHOULD_USE_DATA_AND_WEIGHTS_ZP
#endif

#ifdef ACCUMULATOR_TYPE_4
#undef ACCUMULATOR_TYPE_4
#endif

#ifdef FILTER_TYPE_16
#undef FILTER_TYPE_16
#endif

#undef AS_FILTER_TYPE_4

#undef CEIL_DIV
#undef ALIGN

#undef SIMD
#undef FSV