1 // Copyright (c) 2019 Intel Corporation
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 #include "convolution_kernel_fs_byx_fsv32_depthwise.h"
18 namespace kernel_selector {
// Hardware SIMD sub-group width this kernel is written for.
static constexpr size_t subGroupSize = 16;
// Feature-slice size of the fs_b_yx_fsv32 layout (features per slice).
static constexpr size_t fsv = 32;
// Feature values each work-item is responsible for (32 features / 16 lanes).
static constexpr size_t fsvPerThread = fsv / subGroupSize;
24 ConvolutionKernel_fs_byx_fsv32_depthwise::ConvolutionKernel_fs_byx_fsv32_depthwise()
25 : ConvolutionKernelBase("convolution_gpu_fs_byx_fsv32_depthwise") {
26 std::vector<size_t> blockWidths = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
27 std::vector<std::string> executionModes = ConvolutionKernelBase::autoTuneOptions;
29 for (auto w : blockWidths) {
30 for (auto exeMode : executionModes) {
31 autoTuneOptions.emplace_back(AutoTuneOption{w, exeMode});
36 ParamsKey ConvolutionKernel_fs_byx_fsv32_depthwise::GetSupportedKey() const {
38 k.EnableInputDataType(Datatype::F16);
39 k.EnableOutputDataType(Datatype::F16);
40 k.EnableInputWeightsType(WeightsType::F16);
41 k.EnableInputLayout(DataLayout::fs_b_yx_fsv32);
42 k.EnableOutputLayout(DataLayout::fs_b_yx_fsv32);
43 k.EnableBiasPerFeature();
44 k.EnableBiasPerOutput();
45 k.EnableNonBiasTerm();
48 k.EnableTensorOffset();
49 k.EnableTensorPitches();
50 k.EnableDepthwiseSeparableOpt();
51 k.EnableGroupedConvolution();
55 size_t ConvolutionKernel_fs_byx_fsv32_depthwise::getInputWidth(const convolution_params& arg, size_t blockWidth) const {
56 return (blockWidth - 1) * arg.stride.x + (arg.filterSize.x - 1) * arg.dilation.x + 1;
59 size_t ConvolutionKernel_fs_byx_fsv32_depthwise::getMinRegisterUsage(const convolution_params& arg, size_t blockWidth) const {
60 size_t weightsRegisters = 2;
61 size_t outputRegisters = blockWidth * 2;
62 size_t inputRegisters = getInputWidth(arg, blockWidth) * 2;
64 return weightsRegisters + outputRegisters + inputRegisters;
67 ConvolutionKernel_fs_byx_fsv32_depthwise::AutoTuneOption ConvolutionKernel_fs_byx_fsv32_depthwise::GetAutoTuneOptions(
69 int autoTuneIndex) const {
70 if (autoTuneIndex >= 0 && autoTuneIndex < static_cast<int>(autoTuneOptions.size()))
71 return autoTuneOptions[autoTuneIndex];
73 const convolution_params& cp = static_cast<const convolution_params&>(arg);
75 const size_t regThreshold = 64;
77 std::vector<size_t> nonOptBlockWidths = {3, 2, 1}; // This will most likely be memory bound
78 std::vector<size_t> optBlockWidths = {8, 7, 6, 5, 4};
80 // Check if output can be evenly divided into large blocks
81 for (auto w : optBlockWidths) {
82 if (cp.output.X().v % w == 0 && getMinRegisterUsage(cp, w) < regThreshold)
83 return {w, AGE_BASED};
86 // Try to find large blocks with smallest offset
87 size_t minLeftover = static_cast<size_t>(-1);
88 size_t foundWidth = 0;
89 for (auto w : optBlockWidths) {
90 if (getMinRegisterUsage(cp, w) < regThreshold && Pad(cp.output.X().v, w) < minLeftover) {
91 minLeftover = Pad(cp.output.X().v, w);
97 return {foundWidth, AGE_BASED};
99 // Check small and memory bound block sizes
100 for (auto w : nonOptBlockWidths) {
101 if (cp.output.X().v % w == 0 && getMinRegisterUsage(cp, w) < regThreshold)
102 return {w, AGE_BASED};
105 // This means all previous block sizes consumed too much registers, fallback to block width = 1
106 return {1, AGE_BASED};
109 ConvolutionKernelBase::DispatchData ConvolutionKernel_fs_byx_fsv32_depthwise::SetDefault(const convolution_params& arg,
110 int autoTuneIndex) const {
111 DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);
113 AutoTuneOption option = GetAutoTuneOptions(arg, autoTuneIndex);
115 runInfo.effiency = FORCE_PRIORITY_3;
117 runInfo.cldnnStyle.blockHeight = 1;
118 runInfo.cldnnStyle.blockWidth = option.blockWidth;
119 runInfo.cldnnStyle.inputBlockWidth = getInputWidth(arg, option.blockWidth);
125 runInfo.gws0 = CeilDiv(arg.output.X().v, option.blockWidth);
126 runInfo.gws1 = arg.output.Y().v;
127 runInfo.gws2 = CeilDiv(arg.output.Feature().v, 32) * 16 * arg.output.Batch().v;
132 bool ConvolutionKernel_fs_byx_fsv32_depthwise::Validate(const Params& p, const optional_params& o) const {
133 if (!ConvolutionKernelBase::Validate(p, o))
136 auto cp = static_cast<const convolution_params&>(p);
140 if (cp.inputs[0].Feature().v != cp.groups || cp.output.Feature().v != cp.groups)
143 // Output feature padding must be multiple of fsv to keep block alignment
144 if (cp.output.Feature().pad.before % fsv != 0)
150 JitConstants ConvolutionKernel_fs_byx_fsv32_depthwise::GetJitConstants(const convolution_params& params,
151 const DispatchData& kd) const {
152 auto jit = ConvolutionKernelBase::GetJitConstants(params, kd);
154 jit.AddConstant(MakeJitConstant("INPUT_BLOCK_WIDTH", kd.cldnnStyle.inputBlockWidth));
155 jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", kd.cldnnStyle.blockWidth));
156 jit.AddConstant(MakeJitConstant("FSV", fsv));
157 jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", subGroupSize));
158 jit.AddConstant(MakeJitConstant("FSV_PER_THREAD", fsvPerThread));
163 KernelsData ConvolutionKernel_fs_byx_fsv32_depthwise::GetTunedKernelsDataByIndex(const Params& params,
164 const optional_params& options,
165 const int autoTuneIndex) const {
166 auto tuneOptions = GetAutoTuneOptions(params, autoTuneIndex);
167 return GetCommonKernelsData(params, options, tuneOptions.exeMode, autoTuneIndex);
170 KernelsData ConvolutionKernel_fs_byx_fsv32_depthwise::GetKernelsData(const Params& params, const optional_params& options) const {
171 return GetTunedKernelsDataByIndex(params, options);
174 KernelsData ConvolutionKernel_fs_byx_fsv32_depthwise::GetKernelsDataForAutoTune(const Params& params,
175 const optional_params& options) const {
176 if (!Validate(params, options)) {
180 KernelsData res = {};
182 for (size_t i = 0; i < autoTuneOptions.size(); i++) {
183 KernelsData kd = GetTunedKernelsDataByIndex(params, options, static_cast<int>(i));
185 res.emplace_back(kd[0]);
192 } // namespace kernel_selector