1 // Copyright (c) 2019 Intel Corporation
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 #include "convolution_kernel_fs_byx_fsv32_depthwise.h"
18 namespace kernel_selector {
// Hardware SIMD sub-group width this kernel is written for.
static constexpr size_t subGroupSize = 16;
// Feature-slice size of the fs_b_yx_fsv32 layout (features per slice).
static constexpr size_t fsv = 32;
// Feature values each work-item is responsible for (32 features / 16 lanes).
static constexpr size_t fsvPerThread = fsv / subGroupSize;
24 ConvolutionKernel_fs_byx_fsv32_depthwise::ConvolutionKernel_fs_byx_fsv32_depthwise()
25 : ConvolutionKernelBase("convolution_gpu_fs_byx_fsv32_depthwise") {
26 std::vector<size_t> blockWidths = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
27 std::vector<std::string> executionModes = ConvolutionKernelBase::autoTuneOptions;
29 for (auto w : blockWidths) {
30 for (auto exeMode : executionModes) {
31 autoTuneOptions.emplace_back(AutoTuneOption{w, exeMode});
36 ParamsKey ConvolutionKernel_fs_byx_fsv32_depthwise::GetSupportedKey() const {
38 k.EnableInputDataType(Datatype::F16);
39 k.EnableOutputDataType(Datatype::F16);
40 k.EnableInputWeightsType(WeightsType::F16);
41 k.EnableInputLayout(DataLayout::fs_b_yx_fsv32);
42 k.EnableOutputLayout(DataLayout::fs_b_yx_fsv32);
43 k.EnableBiasPerFeature();
44 k.EnableBiasPerOutput();
45 k.EnableNonBiasTerm();
48 k.EnableTensorOffset();
49 k.EnableTensorPitches();
50 k.EnableDepthwiseSeparableOpt();
51 k.EnableGroupedConvolution();
55 size_t ConvolutionKernel_fs_byx_fsv32_depthwise::getInputWidth(const convolution_params& arg, size_t blockWidth) const {
56 return (blockWidth - 1) * arg.stride.x + (arg.filterSize.x - 1) * arg.dilation.x + 1;
59 size_t ConvolutionKernel_fs_byx_fsv32_depthwise::getMinRegisterUsage(const convolution_params& arg, size_t blockWidth) const {
60 size_t weightsRegisters = 2;
61 size_t outputRegisters = blockWidth * 2;
62 size_t inputRegisters = getInputWidth(arg, blockWidth) * 2;
64 return weightsRegisters + outputRegisters + inputRegisters;
67 ConvolutionKernel_fs_byx_fsv32_depthwise::AutoTuneOption ConvolutionKernel_fs_byx_fsv32_depthwise::GetAutoTuneOptions(
69 int autoTuneIndex) const {
70 if (autoTuneIndex >= 0 && autoTuneIndex < static_cast<int>(autoTuneOptions.size()))
71 return autoTuneOptions[autoTuneIndex];
73 const convolution_params& cp = static_cast<const convolution_params&>(arg);
75 const size_t regThreshold = 64;
77 std::vector<size_t> nonOptBlockWidths = {3, 2, 1}; // This will most likely be memory bound
78 std::vector<size_t> optBlockWidths = {8, 7, 6, 5, 4};
80 // Check if output can be evenly divided into large blocks
81 for (auto w : optBlockWidths) {
82 if (cp.output.X().v % w == 0 && getMinRegisterUsage(cp, w) < regThreshold)
83 return {w, AGE_BASED};
86 // Try to find large blocks with smallest offset
87 size_t minLeftover = static_cast<size_t>(-1);
88 size_t foundWidth = 0;
89 for (auto w : optBlockWidths) {
90 if (getMinRegisterUsage(cp, w) < regThreshold && Pad(cp.output.X().v, w) < minLeftover) {
91 minLeftover = Pad(cp.output.X().v, w);
97 return {foundWidth, AGE_BASED};
99 // Check small and memory bound block sizes
100 for (auto w : nonOptBlockWidths) {
101 if (cp.output.X().v % w == 0 && getMinRegisterUsage(cp, w) < regThreshold)
102 return {w, AGE_BASED};
105 // This means all previous block sizes consumed too much registers, fallback to block width = 1
106 return {1, AGE_BASED};
109 ConvolutionKernelBase::DispatchData ConvolutionKernel_fs_byx_fsv32_depthwise::SetDefault(const convolution_params& arg,
110 int autoTuneIndex) const {
111 DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);
113 AutoTuneOption option = GetAutoTuneOptions(arg, autoTuneIndex);
115 runInfo.effiency = FORCE_PRIORITY_3;
117 runInfo.cldnnStyle.blockHeight = 1;
118 runInfo.cldnnStyle.blockWidth = option.blockWidth;
119 runInfo.cldnnStyle.inputBlockWidth = getInputWidth(arg, option.blockWidth);
125 runInfo.gws0 = CeilDiv(arg.output.X().v, option.blockWidth);
126 runInfo.gws1 = arg.output.Y().v;
127 runInfo.gws2 = CeilDiv(arg.output.Feature().v, 32) * 16 * arg.output.Batch().v;
132 bool ConvolutionKernel_fs_byx_fsv32_depthwise::Validate(const Params& p, const optional_params& o) const {
133 if (!ConvolutionKernelBase::Validate(p, o))
136 auto cp = static_cast<const convolution_params&>(p);
140 if (cp.inputs[0].Feature().v != cp.groups || cp.output.Feature().v != cp.groups)
143 // Output feature padding must be multiple of fsv to keep block alignment
144 if (cp.output.Feature().pad.before % fsv != 0)
150 JitConstants ConvolutionKernel_fs_byx_fsv32_depthwise::GetJitConstants(const convolution_params& params,
151 const DispatchData& kd) const {
152 auto jit = ConvolutionKernelBase::GetJitConstants(params, kd);
154 jit.AddConstant(MakeJitConstant("INPUT_BLOCK_WIDTH", kd.cldnnStyle.inputBlockWidth));
155 jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", kd.cldnnStyle.blockWidth));
156 jit.AddConstant(MakeJitConstant("FSV", fsv));
157 jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", subGroupSize));
158 jit.AddConstant(MakeJitConstant("FSV_PER_THREAD", fsvPerThread));
163 KernelsData ConvolutionKernel_fs_byx_fsv32_depthwise::GetTunedKernelsDataByIndex(const Params& params,
164 const optional_params& options,
165 const int autoTuneIndex) const {
166 auto tuneOptions = GetAutoTuneOptions(params, autoTuneIndex);
167 return GetCommonKernelsData(params, options, tuneOptions.exeMode, autoTuneIndex);
170 KernelsData ConvolutionKernel_fs_byx_fsv32_depthwise::GetKernelsData(const Params& params, const optional_params& options) const {
171 return GetTunedKernelsDataByIndex(params, options);
174 KernelsData ConvolutionKernel_fs_byx_fsv32_depthwise::GetKernelsDataForAutoTune(const Params& params,
175 const optional_params& options) const {
176 if (!Validate(params, options)) {
180 KernelsData res = {};
182 for (size_t i = 0; i < autoTuneOptions.size(); i++) {
183 KernelsData kd = GetTunedKernelsDataByIndex(params, options, static_cast<int>(i));
185 res.emplace_back(kd[0]);
192 } // namespace kernel_selector