// Copyright (c) 2016-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "convolution_kernel_bfyx_f16.h"
#include "kernel_selector_utils.h"

#include <algorithm>
#include <string>
#include <vector>
20 namespace kernel_selector {
22 static const size_t sub_group_size = 16;
23 static const size_t feature_block_size = 16;
25 ConvolutionKernel_bfyx_f16::ConvolutionKernel_bfyx_f16() : ConvolutionKernelBase("convolution_gpu_bfyx_f16") {
26 std::vector<size_t> outputBlockWidths = {2, 4, 8};
27 std::vector<std::string> executionModes = ConvolutionKernelBase::autoTuneOptions;
29 for (auto w : outputBlockWidths) {
30 for (auto exeMode : executionModes) {
31 autoTuneOptions.emplace_back(AutoTuneOption{w, exeMode});
36 ConvolutionKernel_bfyx_f16::AutoTuneOption ConvolutionKernel_bfyx_f16::GetAutoTuneOptions(const Params& params,
37 int autoTuneIndex) const {
38 if (autoTuneIndex >= 0 && autoTuneIndex < static_cast<int>(autoTuneOptions.size()))
39 return autoTuneOptions[autoTuneIndex];
41 const convolution_params& cp = static_cast<const convolution_params&>(params);
43 if (cp.output.X().v > 4)
49 ParamsKey ConvolutionKernel_bfyx_f16::GetSupportedKey() const {
51 k.EnableInputDataType(Datatype::F16);
52 k.EnableOutputDataType(Datatype::F16);
53 k.EnableInputWeightsType(WeightsType::F16);
54 k.EnableInputDataType(Datatype::F32);
55 k.EnableOutputDataType(Datatype::F32);
56 k.EnableInputWeightsType(WeightsType::F32);
57 k.EnableInputLayout(DataLayout::bfyx_f16);
58 k.EnableOutputLayout(DataLayout::bfyx_f16);
59 k.EnableTensorOffset();
60 k.EnableTensorPitches();
62 k.EnableBiasPerFeature();
63 // TODO Add bias per output support to kernel
64 // k.EnableBiasPerOutput();
65 k.EnableNonBiasTerm();
66 k.EnableSplitSupport();
68 k.EnableDepthwiseSeparableOpt();
70 k.EnableSubGroupShort();
74 ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_f16::SetDefault(const convolution_params& params,
75 int autoTuneIndex) const {
76 DispatchData kd = ConvolutionKernelBase::SetDefault(params);
78 const auto& out = params.output;
80 auto autoTune = GetAutoTuneOptions(params, autoTuneIndex);
81 kd.cldnnStyle.blockWidth = autoTune.blockWidth;
85 auto f = out.Feature().v;
86 auto b = out.Batch().v;
88 kd.gws0 = CeilDiv(x, autoTune.blockWidth) * y;
89 kd.gws1 = Align(f, sub_group_size);
93 kd.lws1 = sub_group_size;
97 kd.effiency = FORCE_PRIORITY_2;
99 kd.effiency = FORCE_PRIORITY_7;
104 bool ConvolutionKernel_bfyx_f16::Validate(const Params& p, const optional_params& o) const {
105 if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) {
109 const auto& params = static_cast<const convolution_params&>(p);
111 const auto& input = params.inputs[0];
112 const auto& output = params.output;
114 // Check that padding before features doesn't miss-align the blocks
115 if (input.Feature().pad.before % feature_block_size != 0 || output.Feature().pad.before % feature_block_size != 0)
118 if (params.groups != 1 || params.split != 1)
124 JitConstants ConvolutionKernel_bfyx_f16::GetJitConstants(const convolution_params& params,
125 const DispatchData& runInfo) const {
126 auto input = params.inputs[0];
127 auto output = params.output;
128 auto jit = Parent::GetJitConstants(params, runInfo);
130 auto blockWidth = runInfo.cldnnStyle.blockWidth;
131 if (params.fused_ops.size() > 0) {
132 FusedOpsConfiguration conf_vec = {"_VEC", {"b", "(f_block*16)", "y", "x"}, "dst", blockWidth, true, false, true, false };
133 FusedOpsConfiguration conf_scalar = {"_SCALAR", {"b", "(f_block*16)", "y", "(x+i)"}, "dst[i]", 1, true, false, true, false };
134 jit.Merge(MakeFusedOpsJitConstants(params, {conf_vec, conf_scalar}));
135 jit.Merge(MakeTypeJitConstants(Datatype::F32, "float"));
136 jit.Merge(MakeTypeJitConstants(Datatype::F16, "half"));
139 size_t input_line_size = std::min(params.stride.x * (blockWidth - 1) + (params.weights.X().v - 1)*params.dilation.x + 1,
140 input.X().v + input.X().pad.Total());
142 jit.AddConstant(MakeJitConstant("OUTPUT_X_BLOCK_SIZE", blockWidth));
143 jit.AddConstant(MakeJitConstant("INPUT_LINE_SIZE", input_line_size));
144 jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
145 jit.AddConstant(MakeJitConstant("X_BLOCKS", CeilDiv(output.X().v, blockWidth)));
146 jit.AddConstant(MakeJitConstant("IC_BLOCKS", CeilDiv(input.Feature().v, feature_block_size)));
147 if (params.output.Feature().v % feature_block_size != 0) {
148 jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", 1));
150 if (params.inputs[0].Feature().v % feature_block_size != 0) {
151 jit.AddConstant(MakeJitConstant("INPUT_LEFTOVERS", 1));
157 KernelsData ConvolutionKernel_bfyx_f16::GetTunedKernelsDataByIndex(const Params& params,
158 const optional_params& options,
159 const int autoTuneIndex) const {
160 auto tuneOptions = GetAutoTuneOptions(params, autoTuneIndex);
161 return GetCommonKernelsData(params, options, tuneOptions.exeMode, autoTuneIndex);
164 KernelsData ConvolutionKernel_bfyx_f16::GetKernelsData(const Params& params, const optional_params& options) const {
165 return GetTunedKernelsDataByIndex(params, options);
168 KernelsData ConvolutionKernel_bfyx_f16::GetKernelsDataForAutoTune(const Params& params,
169 const optional_params& options) const {
170 if (!Validate(params, options)) {
174 KernelsData res = {};
176 for (size_t i = 0; i < autoTuneOptions.size(); i++) {
177 KernelsData kd = GetTunedKernelsDataByIndex(params, options, static_cast<int>(i));
179 res.emplace_back(kd[0]);
186 } // namespace kernel_selector