[IE CLDNN] Add asymmetric quantization support to fsv16 imad 1x1 convolution kernel...
[platform/upstream/dldt.git] inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_b_fs_yx_fsv16_imad_1x1.cl
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


#include "include/common.cl"
#include "include/fetch.cl"
#include "include/imad.cl"
#include "include/mmad.cl"
#include "include/data_types.cl"

#define TYPE_N_(type, n) type##n
#define TYPE_N(type, n) TYPE_N_(type, n)
#define AS_TYPE_N_(type, n, x) as_##type##n(x)
#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
#define INPUT0_TYPE_4 TYPE_N(INPUT0_TYPE, 4)
#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)

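// Preprocessor flags controlling in-kernel handling of asymmetric quantization:
//  - SHOULD_USE_DATA_ZP: activation zero-points must be read inside the kernel, either
//    because no precomputed COMPENSATION_TERM is available or because padding before the
//    input requires substituting the zero-point for out-of-bounds input values.
//  - SHOULD_USE_DATA_AND_WEIGHTS_ZP: both activation and weight zero-points are present,
//    so the (activation_zp x weights_zp) cross term is also accumulated in-kernel.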
#if INPUT0_PAD_BEFORE_SIZE_X != 0 || INPUT0_PAD_BEFORE_SIZE_Y != 0
    #define NON_ZERO_INPUT0_PAD_BEFORE
#endif

#if !defined COMPENSATION_TERM || \
    (defined COMPENSATION_TERM && defined NON_ZERO_INPUT0_PAD_BEFORE)
    #define SHOULD_BALANCE_COMPENSATION
#endif

#if defined ASYMMETRIC_DATA_QUANTIZATION && defined SHOULD_BALANCE_COMPENSATION
    #define SHOULD_USE_DATA_ZP
#endif

#if defined ASYMMETRIC_DATA_QUANTIZATION && \
    defined ASYMMETRIC_WEIGHTS_QUANTIZATION && \
    defined SHOULD_BALANCE_COMPENSATION
    #define SHOULD_USE_DATA_AND_WEIGHTS_ZP
#endif

#ifdef SHOULD_USE_DATA_AND_WEIGHTS_ZP
    #define ACCUMULATOR_TYPE_4 TYPE_N(ACCUMULATOR_TYPE, 4)
#endif

#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
    #define FILTER_TYPE_16 TYPE_N(FILTER_TYPE, 16)
#endif

#define AS_FILTER_TYPE_4(x) AS_TYPE_N(FILTER_TYPE, 4, x)

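// Weight layout abstraction: GET_WEIGHTS_INDEX resolves the filter offset for the selected
// blocked layout, WEIGHTS_FEATURE_BLOCK_PITCH is the pitch between consecutive blocks of
// SIMD output features, and WEIGHTS_IS_PITCH is the pitch between consecutive blocks of
// FSV input features.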
#if FILTER_LAYOUT_OS_IS_YX_OSV16_ISV16
#   define GET_WEIGHTS_INDEX(o, i, z, y, x)     GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(FILTER, o, i, y, x)
#   define WEIGHTS_FEATURE_BLOCK_PITCH          (ALIGN(FILTER_IFM_NUM, FSV) * FILTER_SIZE_X * FILTER_SIZE_Y * FSV)
#   define WEIGHTS_IS_PITCH                     (FSV * FSV * FILTER_SIZE_X * FILTER_SIZE_Y)

#elif FILTER_LAYOUT_OS_IS_ZYX_OSV32_ISV16
#   define GET_WEIGHTS_INDEX(o, i, z, y, x)     GET_FILTER_OS_IS_ZYX_OSV32_ISV16_INDEX(FILTER, o, i, z, y, x)
#   define WEIGHTS_FEATURE_BLOCK_PITCH          (FSV * FSV)
#   define WEIGHTS_IS_PITCH                     (2 * FSV * FSV * FILTER_SIZE_X * FILTER_SIZE_Y * FILTER_SIZE_Z)

#elif FILTER_LAYOUT_OS_IS_ZYX_OSV64_ISV16
#   define GET_WEIGHTS_INDEX(o, i, z, y, x)     GET_FILTER_OS_IS_ZYX_OSV64_ISV16_INDEX(FILTER, o, i, z, y, x)
#   define WEIGHTS_FEATURE_BLOCK_PITCH          (FSV * FSV)
#   define WEIGHTS_IS_PITCH                     (4 * FSV * FSV * FILTER_SIZE_X * FILTER_SIZE_Y * FILTER_SIZE_Z)

#endif

#define CEIL_DIV(a, b) (((a) + (b) - 1)/(b))
#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))

#define FSV  16
#define SIMD 16

__attribute__((intel_reqd_sub_group_size(SIMD)))
__attribute__((reqd_work_group_size(1, SIMD * FEATURE_SLM_SPLIT, 1)))
KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
    const __global INPUT0_TYPE   *conv_input,
    __global OUTPUT_TYPE         *output,
    const __global FILTER_TYPE   *weights,
#if BIAS_TERM
    const __global BIAS_TYPE     *biases,
#endif
#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
    const __global WEIGHTS_ZERO_POINTS_TYPE *weights_zp,
#endif
#ifdef ASYMMETRIC_DATA_QUANTIZATION
    const __global ACTIVATIONS_ZERO_POINTS_TYPE *activations_zp,
#endif
#ifdef COMPENSATION_TERM
    const __global COMPENSATION_TYPE *compensation,
#endif
#if HAS_FUSED_OPS_DECLS
    FUSED_OPS_DECLS,
#endif
    uint split_idx)
{
    // Use group ids to help the compiler treat these variables as sub-group uniform
    const uint out_yx_sg = (uint)get_group_id(0) * OUT_BLOCK_SPATIAL;
    uint out_fg = (uint)get_group_id(1) * OUT_BLOCK_FEATURES * SIMD;
    const uint out_b = (uint)get_group_id(2);
    uint out_f = out_fg + get_sub_group_local_id();

    const uint sglid = get_sub_group_local_id();

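    // Output spatial coordinates for this sub-group's OUT_BLOCK_SPATIAL positions,
    // distributed across SIMD lanes; individual values are recovered later with
    // intel_sub_group_shuffle.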
    uint out_x_shuffle[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
    uint out_y_shuffle[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };

    const uint max_out_yx = OUTPUT_SIZE_X * OUTPUT_SIZE_Y;
    uint max_local_yx = min(max_out_yx, out_yx_sg + OUT_BLOCK_SPATIAL);
    __attribute__((opencl_unroll_hint))
    for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
        uint out_yx_shuffle = out_yx_sg + sglid + os * SIMD;
        uint out_yx_clamp = max_out_yx % OUT_BLOCK_SPATIAL == 0
                          ? out_yx_shuffle
                          : min(out_yx_shuffle, max_local_yx - 1);
        out_x_shuffle[os] = out_yx_clamp % OUTPUT_SIZE_X;
        out_y_shuffle[os] = out_yx_clamp / OUTPUT_SIZE_X;
    }

    const uint ifm_blocks = CEIL_DIV(INPUT0_FEATURE_NUM, FSV);
    const uint ifm_blocks_per_sg = ifm_blocks / FEATURE_SLM_SPLIT;
    const uint ifm_per_sg = ifm_blocks_per_sg * FSV;

    uint feature_offset = 0;
    uint feature_blocks = ifm_blocks_per_sg;
#if FEATURE_SLM_SPLIT != 1
    feature_offset = get_sub_group_id() * ifm_per_sg;

    if (ifm_blocks % FEATURE_SLM_SPLIT != 0) {
        bool bigger_sg = get_sub_group_id() < ifm_blocks % FEATURE_SLM_SPLIT;
        feature_blocks = bigger_sg ? ifm_blocks_per_sg + 1 : ifm_blocks_per_sg;
        feature_offset += bigger_sg ? get_sub_group_id() * FSV : ifm_blocks % FEATURE_SLM_SPLIT * FSV;
    }
#endif

    uint filter_idx = GET_WEIGHTS_INDEX(out_f, feature_offset, 0, 0, 0);

    uint input_idx[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
    #ifdef SHOULD_USE_DATA_ZP
        uint input_x[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
        uint input_y[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
    #endif

    __attribute__((opencl_unroll_hint))
    for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
        #ifdef SHOULD_USE_DATA_ZP
            input_x[os] = out_x_shuffle[os] * STRIDE_SIZE_X - PADDING_SIZE_X;
            input_y[os] = out_y_shuffle[os] * STRIDE_SIZE_Y - PADDING_SIZE_Y;
            input_idx[os] = INPUT0_GET_INDEX(out_b, feature_offset, input_y[os], input_x[os]);
        #else
            uint input_x = out_x_shuffle[os] * STRIDE_SIZE_X - PADDING_SIZE_X;
            uint input_y = out_y_shuffle[os] * STRIDE_SIZE_Y - PADDING_SIZE_Y;
            input_idx[os] = INPUT0_GET_INDEX(out_b, feature_offset, input_y, input_x);
        #endif
    }

    ACCUMULATOR_TYPE dotProd[OUT_BLOCK_FEATURES][OUT_BLOCK_SPATIAL] = { };

    #ifdef SHOULD_USE_DATA_ZP
        uint data_zp_idx = feature_offset;
        uint4 data_zp_val;
    #endif

    #ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
        uint4 weights_zp_val[OUT_BLOCK_FEATURES];
        __attribute__((opencl_unroll_hint))
        for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
            weights_zp_val[ofb] = as_uint4((FILTER_TYPE_16)weights_zp[out_f + ofb * FSV]);
        }
        #if INPUT0_FEATURE_NUM % FSV != 0
            uint4 weights_zp_vec_partial[OUT_BLOCK_FEATURES];
            __attribute__((opencl_unroll_hint))
            for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
                weights_zp_vec_partial[ofb] = weights_zp_val[ofb];
                FILTER_TYPE* wzp_p = (FILTER_TYPE*)&weights_zp_vec_partial[ofb];
                __attribute__((opencl_unroll_hint))
                for (uint f = INPUT0_FEATURE_NUM % FSV; f < FSV; f++) {
                    wzp_p[f] = 0;
                }
            }
        #endif
    #endif

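    // Main accumulation loop over blocks of FSV input features.
    // With asymmetric quantization the exact product is (a - a_zp) * (w - w_zp)
    //   = a*w - a_zp*w - a*w_zp + a_zp*w_zp,
    // so in addition to the plain a*w IMAD the kernel accumulates correction terms:
    //   dotProdAxWZP   (a * w_zp)    is always subtracted when weights are asymmetric;
    //   dotProdAZPxW   (a_zp * w) and dotProdAZPxWZP (a_zp * w_zp) are applied in-kernel
    //   only when no precomputed COMPENSATION_TERM folds them in.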
    __attribute__((opencl_unroll_hint(1)))
    for (uint k = 0; k < feature_blocks; ++k) {
        #ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
            #if INPUT0_FEATURE_NUM % FSV != 0
                if (feature_offset + (k + 1) * FSV >= ALIGN(INPUT0_FEATURE_NUM, FSV)) {
                    __attribute__((opencl_unroll_hint))
                    for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
                        weights_zp_val[ofb] = weights_zp_vec_partial[ofb];
                    }
                }
            #endif
        #endif

        #ifdef SHOULD_USE_DATA_ZP
            #if (INPUT0_FEATURE_NUM % FSV != 0)
                data_zp_val = as_uint4(vload16(0, activations_zp + data_zp_idx));
            #else
                data_zp_val = vload4(0, (__global uint *)(activations_zp + data_zp_idx));
            #endif
        #endif

        #ifdef SHOULD_USE_DATA_AND_WEIGHTS_ZP
            ACCUMULATOR_TYPE_4 dotProdAZPxWZP[OUT_BLOCK_FEATURES];
            __attribute__((opencl_unroll_hint))
            for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
                dotProdAZPxWZP[ofb] = 0;
                __attribute__((opencl_unroll_hint))
                for (uint ive = 0; ive < 4; ive++) {
                    dotProdAZPxWZP[ofb][ive] = TO_ACCUMULATOR_TYPE(
                    IMAD(dotProdAZPxWZP[ofb][ive],
                    AS_INPUT0_TYPE_4(data_zp_val[ive]),
                    AS_FILTER_TYPE_4(weights_zp_val[ofb][ive])));
                }
            }
        #endif

        uint4 weights_val[OUT_BLOCK_FEATURES] = { };
        __attribute__((opencl_unroll_hint))
        for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
            weights_val[ofb] = vload4(0, (__global uint*)(weights + filter_idx + ofb * WEIGHTS_FEATURE_BLOCK_PITCH));
        }

        uint4 input_val[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
        __attribute__((opencl_unroll_hint))
        for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
            #if defined ASYMMETRIC_DATA_QUANTIZATION && defined NON_ZERO_INPUT0_PAD_BEFORE
                if (((input_x[os] < 0) || (input_x[os] >= INPUT0_SIZE_X)) ||
                    ((input_y[os] < 0) || (input_y[os] >= INPUT0_SIZE_Y))) {
                    input_val[os] = data_zp_val;
                } else {
            #endif
                    input_val[os] = vload4(0, (__global uint *)(conv_input + input_idx[os]));
            #if defined ASYMMETRIC_DATA_QUANTIZATION && defined NON_ZERO_INPUT0_PAD_BEFORE
                }
            #endif
        }

#if OUT_BLOCK_FEATURES > 1 && FEATURE_SLM_SPLIT != 1 && OUT_BLOCK_SPATIAL > 14
        // In some cases the compiler spills registers here due to the loop order.
        // Use a suboptimal order to avoid this, at the cost of instruction dispatch delays.
        __attribute__((opencl_unroll_hint))
        for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
            __attribute__((opencl_unroll_hint))
            for (uint ive = 0; ive < 4; ++ive) {
                __attribute__((opencl_unroll_hint))
                for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
                    #ifdef SHOULD_USE_DATA_ZP
                        ACCUMULATOR_TYPE dotProdAZPxW = 0;
                        dotProdAZPxW = TO_ACCUMULATOR_TYPE(
                        IMAD(dotProdAZPxW,
                        AS_INPUT0_TYPE_4(data_zp_val[ive]),
                        AS_FILTER_TYPE_4(weights_val[ofb][ive])));
                    #endif
#else
        __attribute__((opencl_unroll_hint))
        for (uint ive = 0; ive < 4; ++ive) {
            __attribute__((opencl_unroll_hint))
            for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
                #ifdef SHOULD_USE_DATA_ZP
                    ACCUMULATOR_TYPE dotProdAZPxW = 0;
                    dotProdAZPxW = TO_ACCUMULATOR_TYPE(
                    IMAD(dotProdAZPxW,
                    AS_INPUT0_TYPE_4(data_zp_val[ive]),
                    AS_FILTER_TYPE_4(weights_val[ofb][ive])));
                #endif
                __attribute__((opencl_unroll_hint))
                for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
#endif
                        INPUT0_TYPE_4 inputs = AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val[os / SIMD][ive], os % SIMD));

                        dotProd[ofb][os] = IMAD(dotProd[ofb][os],
                                                inputs,
                                                AS_FILTER_TYPE_4(weights_val[ofb][ive]));

                        #ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
                            ACCUMULATOR_TYPE dotProdAxWZP = 0;
                            dotProdAxWZP = TO_ACCUMULATOR_TYPE(
                            IMAD(dotProdAxWZP,
                            inputs,
                            AS_FILTER_TYPE_4(weights_zp_val[ofb][ive])));
                            dotProd[ofb][os] -= dotProdAxWZP;
                        #endif

                        #if !defined COMPENSATION_TERM && defined ASYMMETRIC_DATA_QUANTIZATION
                            dotProd[ofb][os] -= dotProdAZPxW;
                        #endif

                        #if (!defined COMPENSATION_TERM && \
                                defined ASYMMETRIC_DATA_QUANTIZATION && \
                                defined ASYMMETRIC_WEIGHTS_QUANTIZATION)
                            dotProd[ofb][os] += dotProdAZPxWZP[ofb][ive];
                        #endif
                }
            }
        }

        filter_idx += WEIGHTS_IS_PITCH;
        __attribute__((opencl_unroll_hint))
        for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
            input_idx[os] += INPUT0_FEATURE_PITCH * FSV;
        }

        #ifdef SHOULD_USE_DATA_ZP
            data_zp_idx += FSV;
        #endif
    }

#if FEATURE_SLM_SPLIT != 1
    // Additional local memory reduction for feature split mode
#   if FEATURE_SLM_SPLIT < OUT_BLOCK_FEATURES
#   error convolution_gpu_b_fs_yx_fsv16_imad_1x1.cl - OUT_BLOCK_FEATURES must be less than or equal to FEATURE_SLM_SPLIT
#   endif
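    // Each sub-group stores the partial sums for the output feature blocks it does not
    // finalize into partial_acc; after the barrier, the first OUT_BLOCK_FEATURES sub-groups
    // each own one output feature block and add the remaining FEATURE_SLM_SPLIT - 1 partial
    // copies from local memory.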

    const uint partial_acc_size = (FEATURE_SLM_SPLIT - 1) * OUT_BLOCK_FEATURES * SIMD * OUT_BLOCK_SPATIAL;
    __local ACCUMULATOR_TYPE partial_acc[partial_acc_size];

    uint sgid_start_idx = get_sub_group_id();
    sgid_start_idx = sgid_start_idx == 0 ? 0 : sgid_start_idx - 1;
    __local ACCUMULATOR_TYPE* partial_acc_ptr = partial_acc + sgid_start_idx * OUT_BLOCK_FEATURES * SIMD * OUT_BLOCK_SPATIAL + sglid;

    if (get_sub_group_id() < OUT_BLOCK_FEATURES) {
        __attribute__((opencl_unroll_hint))
        for (uint wg = 0; wg < OUT_BLOCK_FEATURES; ++wg) {
            if (get_sub_group_id() == wg) {
                __attribute__((opencl_unroll_hint))
                for (uint ofb = 0; ofb < wg; ++ofb) {
                    __attribute__((opencl_unroll_hint))
                    for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
                        const uint partial_acc_ptr_idx =
                            ofb * OUT_BLOCK_SPATIAL * SIMD +
                            os * SIMD;
                        partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][os];
                    }
                }
                __attribute__((opencl_unroll_hint))
                for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
                    dotProd[0][os] = dotProd[wg][os];
                }
                __attribute__((opencl_unroll_hint))
                for (uint ofb = wg + 1; ofb < OUT_BLOCK_FEATURES; ++ofb) {
                    __attribute__((opencl_unroll_hint))
                    for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
                        const uint partial_acc_ptr_idx =
                            ((wg != 0) ? OUT_BLOCK_SPATIAL * OUT_BLOCK_FEATURES * SIMD : 0) +
                            ofb * OUT_BLOCK_SPATIAL * SIMD +
                            os * SIMD;
                        partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][os];
                    }
                }
            }
        }
    } else {
        __attribute__((opencl_unroll_hint))
        for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
            __attribute__((opencl_unroll_hint))
            for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
                const uint partial_acc_ptr_idx =
                    ofb * OUT_BLOCK_SPATIAL * SIMD +
                    os * SIMD;
                partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][os];
            }
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    if (get_sub_group_id() >= OUT_BLOCK_FEATURES)
        return;

    partial_acc_ptr = partial_acc + get_sub_group_id() * OUT_BLOCK_SPATIAL * SIMD + sglid;
    __attribute__((opencl_unroll_hint))
    for (uint wg = 0; wg < FEATURE_SLM_SPLIT - 1; ++wg) {
        __attribute__((opencl_unroll_hint))
        for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
            const uint partial_acc_ptr_idx =
                wg * OUT_BLOCK_FEATURES * SIMD * OUT_BLOCK_SPATIAL +
                os * SIMD;
            dotProd[0][os] += partial_acc_ptr[partial_acc_ptr_idx];
        }
    }
#endif

#if FEATURE_SLM_SPLIT == 1
#   define FINAL_OUT_BLOCK_FEATURES (OUT_BLOCK_FEATURES)
#else
#   define FINAL_OUT_BLOCK_FEATURES 1
    out_f += get_sub_group_id() * SIMD;
    out_fg += get_sub_group_id() * SIMD;

    if (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % OUT_BLOCK_FEATURES != 0 && out_fg >= OUTPUT_FEATURE_NUM)
        return;
#endif
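// From here on each remaining sub-group handles FINAL_OUT_BLOCK_FEATURES blocks of SIMD
// output features: all OUT_BLOCK_FEATURES blocks when there is no SLM split, or only the
// single block it reduced above when FEATURE_SLM_SPLIT != 1.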

#if BIAS_TERM
    // Preload bias
    BIAS_TYPE bias_val[FINAL_OUT_BLOCK_FEATURES];
    for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
        bias_val[ofb] = biases[out_f + ofb * SIMD];
    }
#endif

#ifdef COMPENSATION_TERM
    COMPENSATION_TYPE comp[FINAL_OUT_BLOCK_FEATURES];
    __attribute__((opencl_unroll_hint))
    for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
        comp[ofb] = compensation[out_f + ofb * SIMD];
    }
#endif

    // Convert accumulator type to activation type
    ACTIVATION_TYPE dequantized[FINAL_OUT_BLOCK_FEATURES][OUT_BLOCK_SPATIAL];
    __attribute__((opencl_unroll_hint))
    for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
        __attribute__((opencl_unroll_hint))
        for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
            dequantized[ofb][os] = TO_ACTIVATION_TYPE(dotProd[ofb][os]);

#if BIAS_TERM
            dequantized[ofb][os] += TO_ACTIVATION_TYPE(bias_val[ofb]);
#endif
#ifdef COMPENSATION_TERM
            dequantized[ofb][os] += TO_ACTIVATION_TYPE(comp[ofb]);
#endif
        }
    }

    // Fused ops/activation
    OUTPUT_TYPE result[FINAL_OUT_BLOCK_FEATURES][OUT_BLOCK_SPATIAL];
    __attribute__((opencl_unroll_hint))
    for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD_SCALAR
        FUSED_OPS_PRELOAD_SCALAR;
#endif
        __attribute__((opencl_unroll_hint))
        for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
#if HAS_FUSED_OPS
    #if FUSED_OPS_CAN_USE_PRELOAD_SCALAR
            FUSED_OPS_CALC_SCALAR;
    #else
            FUSED_OPS_SCALAR;
    #endif
            result[ofb][os] = FUSED_OPS_RESULT_SCALAR;
#else
            result[ofb][os] = TO_OUTPUT_TYPE(ACTIVATION(dequantized[ofb][os], ACTIVATION_PARAMS));
#endif
        }
    }

    // Store output
    // Check whether block writes can be used
    bool only_x_block = OUTPUT_SIZE_X % OUT_BLOCK_SPATIAL == 0;
    bool at_least_one_x_block = OUTPUT_SIZE_X >= OUT_BLOCK_SPATIAL;
    bool full_x = out_yx_sg % OUTPUT_SIZE_X <= OUTPUT_SIZE_X - OUT_BLOCK_SPATIAL;
    bool can_write_x = only_x_block || (at_least_one_x_block && full_x);

    bool no_x_pad = OUTPUT_PAD_BEFORE_SIZE_X == 0 && OUTPUT_PAD_AFTER_SIZE_X == 0;
    bool exact_spatial = max_out_yx % OUT_BLOCK_SPATIAL == 0;
    bool full_spatial = out_yx_sg <= max_out_yx - OUT_BLOCK_SPATIAL;
    bool can_write_spatial = no_x_pad && (exact_spatial || full_spatial);

    bool full_feature_block = (OUTPUT_FEATURE_NUM % SIMD == 0) || (out_fg + FINAL_OUT_BLOCK_FEATURES * SIMD <= OUTPUT_FEATURE_NUM);

    bool can_use_full_block_write = full_feature_block && (can_write_x || can_write_spatial);
    if (can_use_full_block_write) {
        uint output_idx = OUTPUT_GET_INDEX(out_b,
                                           out_fg,
                                           intel_sub_group_shuffle(out_y_shuffle[0], 0),
                                           intel_sub_group_shuffle(out_x_shuffle[0], 0));
        __attribute__((opencl_unroll_hint))
        for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
            bool good_of_block = (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES == 0)
                               || (out_fg + FINAL_OUT_BLOCK_FEATURES * SIMD <= OUTPUT_FEATURE_NUM)
                               || (ofb < CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES);
            if (good_of_block) {
                uint os = 0;
#if OUTPUT_TYPE_SIZE == 1
                for (; os + 8 <= OUT_BLOCK_SPATIAL; os += 8) {
                    MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8) result_val;
                    __attribute__((opencl_unroll_hint))
                    for (uint i = 0; i < 8; ++i) {
                        result_val[i] = result[ofb][os + i];
                    }
                    DT_OUTPUT_BLOCK_WRITE8(output, output_idx, result_val);
                    output_idx += 8 * SIMD;
                }
#endif
#if OUTPUT_TYPE_SIZE <= 2
                for (; os + 4 <= OUT_BLOCK_SPATIAL; os += 4) {
                    MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) result_val;
                    __attribute__((opencl_unroll_hint))
                    for (uint i = 0; i < 4; ++i) {
                        result_val[i] = result[ofb][os + i];
                    }
                    DT_OUTPUT_BLOCK_WRITE4(output, output_idx, result_val);
                    output_idx += 4 * SIMD;
                }
#endif
                for (; os + 2 <= OUT_BLOCK_SPATIAL; os += 2) {
                    MAKE_VECTOR_TYPE(OUTPUT_TYPE, 2) result_val;
                    __attribute__((opencl_unroll_hint))
                    for (uint i = 0; i < 2; ++i) {
                        result_val[i] = result[ofb][os + i];
                    }
                    DT_OUTPUT_BLOCK_WRITE2(output, output_idx, result_val);
                    output_idx += 2 * SIMD;
                }
                if (OUT_BLOCK_SPATIAL % 2 == 1) {
                    OUTPUT_TYPE result_val = result[ofb][os];
                    DT_OUTPUT_BLOCK_WRITE(output, output_idx, result_val);
                    output_idx += 1 * SIMD;
                }
            }
            output_idx += OUTPUT_FEATURE_PITCH * FSV - OUT_BLOCK_SPATIAL * SIMD;
        }
    } else {
        uint output_idx_shuffle[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
        __attribute__((opencl_unroll_hint))
        for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
            output_idx_shuffle[os] = OUTPUT_GET_INDEX(out_b, out_fg, out_y_shuffle[os], out_x_shuffle[os]);
        }
        __attribute__((opencl_unroll_hint))
        for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
            bool good_of_block = (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES == 0)
                               || (out_fg + FINAL_OUT_BLOCK_FEATURES * SIMD <= OUTPUT_FEATURE_NUM)
                               || (ofb < CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES);
            if (good_of_block) {
                __attribute__((opencl_unroll_hint))
                for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
                    bool good_os = (max_out_yx % OUT_BLOCK_SPATIAL == 0) || (out_yx_sg <= max_out_yx - OUT_BLOCK_SPATIAL) || (os < max_out_yx % OUT_BLOCK_SPATIAL);
                    if (!good_os)
                        break;

                    uint output_idx = intel_sub_group_shuffle(output_idx_shuffle[os / SIMD], os % SIMD);
                    bool good_of = (OUTPUT_FEATURE_NUM % SIMD == 0) || (out_f + ofb * SIMD < OUTPUT_FEATURE_NUM);

                    if (!good_of)
                        result[ofb][os] = (OUTPUT_TYPE)0;

                    output[output_idx + sglid] = result[ofb][os];
                }
            }

            __attribute__((opencl_unroll_hint))
            for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
                output_idx_shuffle[os] += OUTPUT_FEATURE_PITCH * FSV;
            }
        }
    }

#undef FINAL_OUT_BLOCK_FEATURES
}

#undef TYPE_N_
#undef TYPE_N
#undef AS_TYPE_N
#undef AS_TYPE_N_

#undef INPUT0_TYPE_4
#undef AS_INPUT0_TYPE_4

#ifdef NON_ZERO_INPUT0_PAD_BEFORE
    #undef NON_ZERO_INPUT0_PAD_BEFORE
#endif

#ifdef SHOULD_BALANCE_COMPENSATION
    #undef SHOULD_BALANCE_COMPENSATION
#endif

#ifdef SHOULD_USE_DATA_ZP
    #undef SHOULD_USE_DATA_ZP
#endif

#ifdef SHOULD_USE_DATA_AND_WEIGHTS_ZP
    #undef SHOULD_USE_DATA_AND_WEIGHTS_ZP
#endif

#ifdef ACCUMULATOR_TYPE_4
#undef ACCUMULATOR_TYPE_4
#endif

#ifdef FILTER_TYPE_16
#undef FILTER_TYPE_16
#endif

#undef AS_FILTER_TYPE_4

#undef CEIL_DIV
#undef ALIGN

#undef SIMD
#undef FSV