2 // Copyright (c) 2017-2018 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
17 #include "convolution_kernel_bfyx_3x3_dw_opt.h"
19 namespace kernel_selector
21 ConvolutionKernel_bfyx_3x3_dw_opt::ConvolutionKernel_bfyx_3x3_dw_opt() : ConvolutionKernelBase("convolution_gpu_bfyx_3x3_dw_opt")
23 // Generate the dispatch options to the auto-tuner.
24 std::vector<size_t> tileXDimSizes = { 1,2,4,5,6,8,10,12,14 };
25 std::vector<size_t> tileYDimSizes = { 1,2,3,4,5,6,7 };
26 std::vector<std::string> executionModes = ConvolutionKernelBase::autoTuneOptions;
28 for (auto tileXDim : tileXDimSizes)
30 for (auto tileYDim : tileYDimSizes)
32 for (auto executionMode : executionModes)
34 autoTuneOptions.emplace_back(AutoTuneOption{ {tileXDim, tileYDim}, executionMode });
40 ParamsKey ConvolutionKernel_bfyx_3x3_dw_opt::GetSupportedKey() const
43 k.EnableInputDataType(Datatype::F32);
44 k.EnableInputDataType(Datatype::F16);
45 k.EnableInputWeightsType(WeightsType::F16);
46 k.EnableInputWeightsType(WeightsType::F32);
47 k.EnableOutputDataType(Datatype::F32);
48 k.EnableOutputDataType(Datatype::F16);
49 k.EnableInputLayout(DataLayout::bfyx);
50 k.EnableOutputLayout(DataLayout::bfyx);
51 k.EnableTensorOffset();
52 k.EnableTensorPitches();
53 k.EnableBiasPerFeature();
54 k.EnableNonBiasTerm();
56 k.EnableSplitSupport();
58 k.EnableSubGroupShort();
59 k.EnableDepthwiseSeparableOpt();
63 bool ConvolutionKernel_bfyx_3x3_dw_opt::Validate(const Params& p, const optional_params& o) const
65 if (!ConvolutionKernelBase::Validate(p, o) ||
66 !CovolutionCheckInput(p, o))
71 const convolution_params& cp = static_cast<const convolution_params&>(p);
73 if ((cp.filterSize.x != 3) ||
74 (cp.filterSize.y != 3) ||
77 (cp.padding.x != 1) ||
78 (cp.padding.y != 1) ||
79 (cp.inputs[0].Feature().v != cp.split) ||
80 cp.output.PitchesDifferFromLogicalDims())
88 ConvolutionKernel_bfyx_3x3_dw_opt::AutoTuneOption ConvolutionKernel_bfyx_3x3_dw_opt::GetAutoTuneOptions(const Params&, int autoTuneIndex) const
90 if ((autoTuneIndex >= 0) && (autoTuneIndex < (int)autoTuneOptions.size()))
92 return autoTuneOptions[autoTuneIndex];
95 constexpr int simdSize = 16;
97 return AutoTuneOption{ { simdSize - 2, 7 }, DEFAULT };
100 ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_3x3_dw_opt::SetDefault(const convolution_params& params, int autoTuneIndex) const
102 constexpr int simdSize = 16;
104 DispatchData runInfo = Parent::SetDefault(params);
106 auto options = GetAutoTuneOptions(params, autoTuneIndex);
108 const int numTilesX = static_cast<int>(std::ceil(static_cast<float>(params.inputs[0].X().v) / static_cast<float>(options.tileDims.x)));
109 const int numTilesY = static_cast<int>(std::ceil(static_cast<float>(params.inputs[0].Y().v) / static_cast<float>(options.tileDims.y)));
111 runInfo.cldnnStyle.blockWidth = options.tileDims.x;
112 runInfo.cldnnStyle.blockHeight = options.tileDims.y;
113 runInfo.gws0 = numTilesX * simdSize;
114 runInfo.gws1 = numTilesY;
115 runInfo.gws2 = params.inputs[0].Feature().v * params.inputs[0].Batch().v;
116 runInfo.lws0 = simdSize;
120 runInfo.effiency = FORCE_PRIORITY_5;
125 JitConstants ConvolutionKernel_bfyx_3x3_dw_opt::GetJitConstants(const convolution_params& params, const DispatchData& kd) const
127 stSize tileDims = { kd.cldnnStyle.blockWidth, kd.cldnnStyle.blockHeight };
128 auto mem_consts = ConvolutionKernelBase::GetJitConstants(params, kd);
130 if (tileDims.y != 0 && tileDims.x != 0)
132 mem_consts.AddConstant(MakeJitConstant("UNIT_BYTE_SIZE", kd.fp16UnitUsed ? sizeof(short) : sizeof(float)));
133 mem_consts.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", kd.lws0));
134 mem_consts.AddConstant(MakeJitConstant("TILE_HEIGHT", tileDims.y));
135 mem_consts.AddConstant(MakeJitConstant("TILE_WIDTH", tileDims.x));
141 KernelsData ConvolutionKernel_bfyx_3x3_dw_opt::GetTunedKernelsDataByIndex(const Params& params, const optional_params& options, const int autoTuneIndex) const
143 constexpr int simdSize = 16;
145 KernelData kd = KernelData::Default<convolution_params>(params);
146 convolution_params& convParams = *static_cast<convolution_params*>(kd.params.get());
147 DispatchData runInfo = SetDefault(convParams, autoTuneIndex);
149 if (static_cast<int>(static_cast<int>(runInfo.gws0 - 1) / simdSize) * runInfo.cldnnStyle.blockWidth + simdSize > convParams.inputs[0].Y().pitch)
151 // Internal Error - requested tile size is not supported for y pitch
155 return GetCommonKernelsData(params, options, GetAutoTuneOptions(params, autoTuneIndex).exeMode, autoTuneIndex);
158 KernelsData ConvolutionKernel_bfyx_3x3_dw_opt::GetKernelsData(const Params& params, const optional_params& options) const
160 return GetTunedKernelsDataByIndex(params, options, -1);
163 KernelsData ConvolutionKernel_bfyx_3x3_dw_opt::GetKernelsDataForAutoTune(const Params& params, const optional_params& options) const
165 if (!Validate(params, options))
170 KernelsData res = {};
172 for (size_t i = 0; i < autoTuneOptions.size(); i++)
174 KernelsData kd = GetTunedKernelsDataByIndex(params, options, (int)i);
177 res.emplace_back(kd[0]);
181 KernelsData defaultKds = GetKernelsData(params, options);
182 res.insert(res.end(), defaultKds.begin(), defaultKds.end());