// Copyright (c) 2018-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "include/common.cl"
#include "include/fetch.cl"
#include "include/imad.cl"
#include "include/mmad.cl"
#include "include/data_types.cl"

#define FSV 16
#define SIMD 16
#if FILTER_LAYOUT_OS_IS_YX_OSV16_ISV16
# define GET_WEIGHTS_INDEX(o, i, z, y, x)   GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(FILTER, o, i, y, x)
# define WEIGHTS_FEATURE_BLOCK_PITCH        (ALIGN(FILTER_IFM_NUM, FSV) * FILTER_SIZE_X * FILTER_SIZE_Y * FSV)
# define WEIGHTS_IS_PITCH                   (FSV * FSV * FILTER_SIZE_X * FILTER_SIZE_Y)

#elif FILTER_LAYOUT_OS_IS_ZYX_OSV32_ISV16
# define GET_WEIGHTS_INDEX(o, i, z, y, x)   GET_FILTER_OS_IS_ZYX_OSV32_ISV16_INDEX(FILTER, o, i, z, y, x)
# define WEIGHTS_FEATURE_BLOCK_PITCH        (FSV * FSV)
# define WEIGHTS_IS_PITCH                   (2 * FSV * FSV * FILTER_SIZE_X * FILTER_SIZE_Y * FILTER_SIZE_Z)

#elif FILTER_LAYOUT_OS_IS_ZYX_OSV64_ISV16
# define GET_WEIGHTS_INDEX(o, i, z, y, x)   GET_FILTER_OS_IS_ZYX_OSV64_ISV16_INDEX(FILTER, o, i, z, y, x)
# define WEIGHTS_FEATURE_BLOCK_PITCH        (FSV * FSV)
# define WEIGHTS_IS_PITCH                   (4 * FSV * FSV * FILTER_SIZE_X * FILTER_SIZE_Y * FILTER_SIZE_Z)
#endif
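// Layout note (an informal sketch, not normative): os_is_yx_osv16_isv16 stores
// weights in blocks of 16 output features (osv) by 16 input features (isv).
// WEIGHTS_IS_PITCH is the distance between consecutive isv-blocks of one
// osv-block, and WEIGHTS_FEATURE_BLOCK_PITCH is the distance between
// consecutive osv-blocks. E.g. for a 1x1 filter with FILTER_IFM_NUM = 32:
//   WEIGHTS_FEATURE_BLOCK_PITCH = ALIGN(32, 16) * 1 * 1 * 16 = 512
//   WEIGHTS_IS_PITCH            = 16 * 16 * 1 * 1            = 256
// so the main loop below steps filter_idx by WEIGHTS_IS_PITCH once per
// 16-input-channel slice.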
#define AS_TYPE_N_(type, n, x) as_##type##n(x)
#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
#define AS_FILTER_TYPE_4(x) AS_TYPE_N(FILTER_TYPE, 4, x)
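// A minimal illustration (assuming INPUT0_TYPE is char): AS_INPUT0_TYPE_4(x)
// expands to as_char4(x), a bit-reinterpretation of one 32-bit uint as four
// signed 8-bit values; on a little-endian GPU as_char4(0x04030201u) yields
// (char4)(1, 2, 3, 4). No conversion or saturation is performed.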
#define CEIL_DIV(a, b) (((a) + (b) - 1)/(b))
#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
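// Example arithmetic: CEIL_DIV(20, 16) = (20 + 15) / 16 = 2 and
// ALIGN(20, 16) = 2 * 16 = 32, i.e. round a up to the next multiple of b.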
__attribute__((intel_reqd_sub_group_size(SIMD)))
__attribute__((reqd_work_group_size(1, SIMD * FEATURE_SLM_SPLIT, 1)))
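// Work-group geometry (informal): each work-group is FEATURE_SLM_SPLIT
// sub-groups of SIMD work-items. get_group_id(0) walks OUT_BLOCK_SPATIAL-sized
// spatial blocks, get_group_id(1) walks blocks of OUT_BLOCK_FEATURES * SIMD
// output features, and get_group_id(2) walks the batch.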
KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
    const __global INPUT0_TYPE  *conv_input,
    __global OUTPUT_TYPE        *output,
    const __global FILTER_TYPE  *weights,
#if BIAS_TERM
    const __global BIAS_TYPE    *biases,
#endif
#if HAS_FUSED_OPS_DECLS
    FUSED_OPS_DECLS,
#endif
    uint split_idx) {
    // Use group ids to make it easier for the compiler to prove these values sub-group uniform
    const uint out_yx_sg = (uint)get_group_id(0) * OUT_BLOCK_SPATIAL;
    uint out_fg = (uint)get_group_id(1) * OUT_BLOCK_FEATURES * SIMD;
    const uint out_b = (uint)get_group_id(2);
    uint out_f = out_fg + get_sub_group_local_id();

    const uint sglid = get_sub_group_local_id();

    uint out_x_shuffle[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
    uint out_y_shuffle[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
    const uint max_out_yx = OUTPUT_SIZE_X * OUTPUT_SIZE_Y;
    uint max_local_yx = min(max_out_yx, out_yx_sg + OUT_BLOCK_SPATIAL);
    __attribute__((opencl_unroll_hint))
    for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
        uint out_yx_shuffle = out_yx_sg + sglid + os * SIMD;
        uint out_yx_clamp = max_out_yx % OUT_BLOCK_SPATIAL == 0
                            ? out_yx_shuffle
                            : min(out_yx_shuffle, max_local_yx - 1);
        out_x_shuffle[os] = out_yx_clamp % OUTPUT_SIZE_X;
        out_y_shuffle[os] = out_yx_clamp / OUTPUT_SIZE_X;
    }
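    // How the shuffle registers work (informal sketch): the OUT_BLOCK_SPATIAL
    // x/y coordinates of this sub-group's output block are spread across the
    // lanes, one coordinate per lane per register, and recovered later with
    // intel_sub_group_shuffle(out_x_shuffle[os / SIMD], os % SIMD). E.g. with
    // SIMD = 16, OUT_BLOCK_SPATIAL = 14 and OUTPUT_SIZE_X = 56, lane 3 of
    // out_x_shuffle[0] holds (out_yx_sg + 3) % 56. Out-of-range lanes are
    // clamped to the last valid yx so the address computation stays in bounds.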
    const uint ifm_blocks = CEIL_DIV(INPUT0_FEATURE_NUM, FSV);
    const uint ifm_blocks_per_sg = ifm_blocks / FEATURE_SLM_SPLIT;
    const uint ifm_per_sg = ifm_blocks_per_sg * FSV;

    uint feature_offset = 0;
    uint feature_blocks = ifm_blocks_per_sg;
#if FEATURE_SLM_SPLIT != 1
    feature_offset = get_sub_group_id() * ifm_per_sg;

    if (ifm_blocks % FEATURE_SLM_SPLIT != 0) {
        bool bigger_sg = get_sub_group_id() < ifm_blocks % FEATURE_SLM_SPLIT;
        feature_blocks = bigger_sg ? ifm_blocks_per_sg + 1 : ifm_blocks_per_sg;
        feature_offset += bigger_sg ? get_sub_group_id() * FSV : ifm_blocks % FEATURE_SLM_SPLIT * FSV;
    }
#endif
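    // Remainder distribution example (informal): with ifm_blocks = 10 and
    // FEATURE_SLM_SPLIT = 4, ifm_blocks_per_sg = 2 and 10 % 4 = 2, so
    // sub-groups 0..1 each take 3 input-feature blocks and sub-groups 2..3
    // take 2; feature_offset then becomes 0, 48, 96, 128 input features
    // (FSV = 16), covering all 160 features with no overlap.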
    uint filter_idx = GET_WEIGHTS_INDEX(out_f, feature_offset, 0, 0, 0);

    uint input_idx[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
    __attribute__((opencl_unroll_hint))
    for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
        uint input_x = out_x_shuffle[os] * STRIDE_SIZE_X - PADDING_SIZE_X;
        uint input_y = out_y_shuffle[os] * STRIDE_SIZE_Y - PADDING_SIZE_Y;
        input_idx[os] = INPUT0_GET_INDEX(out_b, feature_offset, input_y, input_x);
    }
    ACCUMULATOR_TYPE dotProd[OUT_BLOCK_FEATURES][OUT_BLOCK_SPATIAL] = { };

    __attribute__((opencl_unroll_hint(1)))
    for (uint k = 0; k < feature_blocks; ++k) {
        uint4 weights_val[OUT_BLOCK_FEATURES] = { };
        __attribute__((opencl_unroll_hint))
        for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
            weights_val[ofb] = vload4(0, (__global uint*)(weights + filter_idx + ofb * WEIGHTS_FEATURE_BLOCK_PITCH));
        }

        uint4 input_val[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
        __attribute__((opencl_unroll_hint))
        for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
            input_val[os] = vload4(0, (__global uint *)(conv_input + input_idx[os]));
        }
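        // Load note (informal): each lane's vload4 brings in 16 bytes, i.e. a
        // full FSV = 16 input-feature slice for one spatial position packed as
        // 4 uints of four 8-bit values each, matching the isv16 weight block
        // loaded above.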
#if OUT_BLOCK_FEATURES > 1 && FEATURE_SLM_SPLIT != 1 && OUT_BLOCK_SPATIAL > 14
        // In some cases the compiler spills registers here due to the loop order.
        // Use a suboptimal order to avoid this, at the cost of instruction dispatch delays.
        __attribute__((opencl_unroll_hint))
        for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
            __attribute__((opencl_unroll_hint))
            for (uint ive = 0; ive < 4; ++ive) {
                __attribute__((opencl_unroll_hint))
                for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
#else
        __attribute__((opencl_unroll_hint))
        for (uint ive = 0; ive < 4; ++ive) {
            __attribute__((opencl_unroll_hint))
            for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
                __attribute__((opencl_unroll_hint))
                for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
#endif
                    dotProd[ofb][os] = IMAD(dotProd[ofb][os],
                                            AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val[os / SIMD][ive], os % SIMD)),
                                            AS_FILTER_TYPE_4(weights_val[ofb][ive]));
                }
            }
        }
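        // IMAD semantics (informal): IMAD(acc, a, b) accumulates a 4-way 8-bit
        // dot product into a 32-bit accumulator,
        //   acc + a.s0*b.s0 + a.s1*b.s1 + a.s2*b.s2 + a.s3*b.s3,
        // mapping to the hardware integer dot-product instruction where
        // available. The sub-group shuffle broadcasts spatial position os's
        // packed inputs from the lane that loaded them, so every lane (one
        // output feature each) reuses the same input bytes.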
        filter_idx += WEIGHTS_IS_PITCH;
        __attribute__((opencl_unroll_hint))
        for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
            input_idx[os] += INPUT0_FEATURE_PITCH * FSV;
        }
    }
#if FEATURE_SLM_SPLIT != 1
    // Additional local memory reduction for feature-split mode
# if FEATURE_SLM_SPLIT < OUT_BLOCK_FEATURES
# error convolution_gpu_b_fs_yx_fsv16_imad_1x1.cl - OUT_BLOCK_FEATURES must be less than or equal to FEATURE_SLM_SPLIT
# endif

    const uint partial_acc_size = (FEATURE_SLM_SPLIT - 1) * OUT_BLOCK_FEATURES * SIMD * OUT_BLOCK_SPATIAL;
    __local ACCUMULATOR_TYPE partial_acc[partial_acc_size];
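    // SLM sizing sketch (informal): partial_acc holds FEATURE_SLM_SPLIT - 1
    // copies of the per-sub-group partial sums; the sub-group that will reduce
    // a feature block keeps its own contribution in registers, so one copy is
    // never spilled. E.g. FEATURE_SLM_SPLIT = 4, OUT_BLOCK_FEATURES = 2,
    // SIMD = 16, OUT_BLOCK_SPATIAL = 8 gives 3 * 2 * 16 * 8 = 768 accumulators.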
    uint sgid_start_idx = get_sub_group_id();
    sgid_start_idx = sgid_start_idx == 0 ? 0 : sgid_start_idx - 1;
    __local ACCUMULATOR_TYPE* partial_acc_ptr = partial_acc + sgid_start_idx * OUT_BLOCK_FEATURES * SIMD * OUT_BLOCK_SPATIAL + sglid;
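    // Write phase of the split reduction (informal sketch): sub-group i with
    // i < OUT_BLOCK_FEATURES keeps feature block i in registers (moved into
    // dotProd[0]) and writes only its other blocks to SLM, while the remaining
    // sub-groups write every block. sgid_start_idx skips the reader's own
    // slot, which is why only FEATURE_SLM_SPLIT - 1 copies are allocated.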
    if (get_sub_group_id() < OUT_BLOCK_FEATURES) {
        __attribute__((opencl_unroll_hint))
        for (uint wg = 0; wg < OUT_BLOCK_FEATURES; ++wg) {
            if (get_sub_group_id() == wg) {
                __attribute__((opencl_unroll_hint))
                for (uint ofb = 0; ofb < wg; ++ofb) {
                    __attribute__((opencl_unroll_hint))
                    for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
                        const uint partial_acc_ptr_idx =
                            ofb * OUT_BLOCK_SPATIAL * SIMD +
                            os * SIMD;
                        partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][os];
                    }
                }
                __attribute__((opencl_unroll_hint))
                for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
                    dotProd[0][os] = dotProd[wg][os];
                }
                __attribute__((opencl_unroll_hint))
                for (uint ofb = wg + 1; ofb < OUT_BLOCK_FEATURES; ++ofb) {
                    __attribute__((opencl_unroll_hint))
                    for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
                        const uint partial_acc_ptr_idx =
                            ((wg != 0) ? OUT_BLOCK_SPATIAL * OUT_BLOCK_FEATURES * SIMD : 0) +
                            ofb * OUT_BLOCK_SPATIAL * SIMD +
                            os * SIMD;
                        partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][os];
                    }
                }
            }
        }
    } else {
        __attribute__((opencl_unroll_hint))
        for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
            __attribute__((opencl_unroll_hint))
            for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
                const uint partial_acc_ptr_idx =
                    ofb * OUT_BLOCK_SPATIAL * SIMD +
                    os * SIMD;
                partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][os];
            }
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (get_sub_group_id() >= OUT_BLOCK_FEATURES)
        return;

    partial_acc_ptr = partial_acc + get_sub_group_id() * OUT_BLOCK_SPATIAL * SIMD + sglid;
    __attribute__((opencl_unroll_hint))
    for (uint wg = 0; wg < FEATURE_SLM_SPLIT - 1; ++wg) {
        __attribute__((opencl_unroll_hint))
        for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
            const uint partial_acc_ptr_idx =
                wg * OUT_BLOCK_FEATURES * SIMD * OUT_BLOCK_SPATIAL +
                os * SIMD;
            dotProd[0][os] += partial_acc_ptr[partial_acc_ptr_idx];
        }
    }
#endif
#if FEATURE_SLM_SPLIT == 1
# define FINAL_OUT_BLOCK_FEATURES (OUT_BLOCK_FEATURES)
#else
# define FINAL_OUT_BLOCK_FEATURES 1
    out_f += get_sub_group_id() * SIMD;
    out_fg += get_sub_group_id() * SIMD;
#endif

    if (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % OUT_BLOCK_FEATURES != 0 && out_fg >= OUTPUT_FEATURE_NUM)
        return;
#if BIAS_TERM
    BIAS_TYPE bias_val[FINAL_OUT_BLOCK_FEATURES];
    for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
        bias_val[ofb] = biases[out_f + ofb * SIMD];
    }
#endif
    // Convert accumulator type to activation type
    ACTIVATION_TYPE dequantized[FINAL_OUT_BLOCK_FEATURES][OUT_BLOCK_SPATIAL];
    __attribute__((opencl_unroll_hint))
    for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
        __attribute__((opencl_unroll_hint))
        for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
            dequantized[ofb][os] = TO_ACTIVATION_TYPE(dotProd[ofb][os]);
#if BIAS_TERM
            dequantized[ofb][os] += TO_ACTIVATION_TYPE(bias_val[ofb]);
#endif
        }
    }
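    // Dequantization note (informal): with int8 inputs dotProd is a raw int32
    // accumulator; TO_ACTIVATION_TYPE converts it to the activation type
    // (typically float) so the post-ops below operate in real arithmetic,
    // e.g. an accumulator of 1234 with a fused scale of 0.02f becomes 24.68f.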
    // Fused ops/activation
    OUTPUT_TYPE result[FINAL_OUT_BLOCK_FEATURES][OUT_BLOCK_SPATIAL];
    __attribute__((opencl_unroll_hint))
    for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD_SCALAR
        FUSED_OPS_PRELOAD_SCALAR;
#endif
        __attribute__((opencl_unroll_hint))
        for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
#if HAS_FUSED_OPS
#if FUSED_OPS_CAN_USE_PRELOAD_SCALAR
            FUSED_OPS_CALC_SCALAR;
#else
            FUSED_OPS_SCALAR;
#endif
            result[ofb][os] = FUSED_OPS_RESULT_SCALAR;
#else
            result[ofb][os] = TO_OUTPUT_TYPE(ACTIVATION(dequantized[ofb][os], ACTIVATION_PARAMS));
#endif
        }
    }
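    // Note (informal): the FUSED_OPS_* macros are generated on the host by the
    // kernel selector and expand to the fused post-ops (eltwise, quantize,
    // activation, ...). The PRELOAD/CALC variants hoist loads that do not
    // depend on the spatial index out of the inner loop.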
    // Check if block writes can be used
    bool only_x_block = OUTPUT_SIZE_X % OUT_BLOCK_SPATIAL == 0;
    bool at_least_one_x_block = OUTPUT_SIZE_X >= OUT_BLOCK_SPATIAL;
    bool full_x = out_yx_sg % OUTPUT_SIZE_X <= OUTPUT_SIZE_X - OUT_BLOCK_SPATIAL;
    bool can_write_x = only_x_block || (at_least_one_x_block && full_x);

    bool no_x_pad = OUTPUT_PAD_BEFORE_SIZE_X == 0 && OUTPUT_PAD_AFTER_SIZE_X == 0;
    bool exact_spatial = max_out_yx % OUT_BLOCK_SPATIAL == 0;
    bool full_spatial = out_yx_sg <= max_out_yx - OUT_BLOCK_SPATIAL;
    bool can_write_spatial = no_x_pad && (exact_spatial || full_spatial);

    bool full_feature_block = (OUTPUT_FEATURE_NUM % SIMD == 0) || (out_fg + FINAL_OUT_BLOCK_FEATURES * SIMD <= OUTPUT_FEATURE_NUM);

    bool can_use_full_block_write = full_feature_block && (can_write_x || can_write_spatial);
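    // Example (informal): OUTPUT_SIZE_X = 56 with OUT_BLOCK_SPATIAL = 14 gives
    // only_x_block = true, so every block lies inside one output row and block
    // writes are safe. With OUTPUT_SIZE_X = 20, a block starting at x = 10
    // would cross the row boundary (full_x is false since 10 > 20 - 14); it
    // may still block-write across rows via can_write_spatial when there is no
    // x padding, because fsv16 rows are then contiguous in memory.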
    if (can_use_full_block_write) {
        uint output_idx = OUTPUT_GET_INDEX(out_b,
                                           out_fg,
                                           intel_sub_group_shuffle(out_y_shuffle[0], 0),
                                           intel_sub_group_shuffle(out_x_shuffle[0], 0));
        __attribute__((opencl_unroll_hint))
        for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
            bool good_of_block = (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES == 0)
                               || (out_fg + FINAL_OUT_BLOCK_FEATURES * SIMD <= OUTPUT_FEATURE_NUM)
                               || (ofb < CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES);

            if (good_of_block) {
                uint os = 0;
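                // Write strategy (informal): emit the widest sub-group block
                // writes the remaining spatial count allows, narrowing
                // 8 -> 4 -> 2 -> 1. Each DT_OUTPUT_BLOCK_WRITEn stores n * SIMD
                // elements of one feature slice, hence output_idx advances by
                // n * SIMD; 8-wide writes are used only for 1-byte outputs and
                // 4-wide up to 2-byte outputs.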
#if OUTPUT_TYPE_SIZE == 1
                for (; os + 8 <= OUT_BLOCK_SPATIAL; os += 8) {
                    MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8) result_val;
                    __attribute__((opencl_unroll_hint))
                    for (uint i = 0; i < 8; ++i) {
                        result_val[i] = result[ofb][os + i];
                    }
                    DT_OUTPUT_BLOCK_WRITE8(output, output_idx, result_val);
                    output_idx += 8 * SIMD;
                }
#endif
#if OUTPUT_TYPE_SIZE <= 2
                for (; os + 4 <= OUT_BLOCK_SPATIAL; os += 4) {
                    MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) result_val;
                    __attribute__((opencl_unroll_hint))
                    for (uint i = 0; i < 4; ++i) {
                        result_val[i] = result[ofb][os + i];
                    }
                    DT_OUTPUT_BLOCK_WRITE4(output, output_idx, result_val);
                    output_idx += 4 * SIMD;
                }
#endif
                for (; os + 2 <= OUT_BLOCK_SPATIAL; os += 2) {
                    MAKE_VECTOR_TYPE(OUTPUT_TYPE, 2) result_val;
                    __attribute__((opencl_unroll_hint))
                    for (uint i = 0; i < 2; ++i) {
                        result_val[i] = result[ofb][os + i];
                    }
                    DT_OUTPUT_BLOCK_WRITE2(output, output_idx, result_val);
                    output_idx += 2 * SIMD;
                }
                if (OUT_BLOCK_SPATIAL % 2 == 1) {
                    OUTPUT_TYPE result_val = result[ofb][os];
                    DT_OUTPUT_BLOCK_WRITE(output, output_idx, result_val);
                    output_idx += 1 * SIMD;
                }
            }
            output_idx += OUTPUT_FEATURE_PITCH * FSV - OUT_BLOCK_SPATIAL * SIMD;
        }
    } else {
        uint output_idx_shuffle[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
        __attribute__((opencl_unroll_hint))
        for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
            output_idx_shuffle[os] = OUTPUT_GET_INDEX(out_b, out_fg, out_y_shuffle[os], out_x_shuffle[os]);
        }
        __attribute__((opencl_unroll_hint))
        for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
            bool good_of_block = (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES == 0)
                               || (out_fg + FINAL_OUT_BLOCK_FEATURES * SIMD <= OUTPUT_FEATURE_NUM)
                               || (ofb < CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES);

            __attribute__((opencl_unroll_hint))
            for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
                bool good_os = (max_out_yx % OUT_BLOCK_SPATIAL == 0) || (out_yx_sg <= max_out_yx - OUT_BLOCK_SPATIAL) || (os < max_out_yx % OUT_BLOCK_SPATIAL);
                if (!good_os || !good_of_block)
                    continue;

                uint output_idx = intel_sub_group_shuffle(output_idx_shuffle[os / SIMD], os % SIMD);
                bool good_of = (OUTPUT_FEATURE_NUM % SIMD == 0) || (out_f + ofb * SIMD < OUTPUT_FEATURE_NUM);

                if (!good_of)
                    result[ofb][os] = (OUTPUT_TYPE)0;

                output[output_idx + sglid] = result[ofb][os];
            }
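            // Tail-handling note (informal): spatial positions past the end of
            // the output are skipped entirely, while out-of-range features
            // within the last 16-wide slice are written as zeros, keeping the
            // feature-padding bytes of the b_fs_yx_fsv16 layout zero-filled.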
            __attribute__((opencl_unroll_hint))
            for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
                output_idx_shuffle[os] += OUTPUT_FEATURE_PITCH * FSV;
            }
        }
    }
#undef FINAL_OUT_BLOCK_FEATURES
}

#undef AS_INPUT0_TYPE_4
#undef AS_FILTER_TYPE_4