// Copyright (c) 2016-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "convolution_kernel_bfyx_f16.h"
#include "kernel_selector_utils.h"

#include <algorithm>
#include <string>
#include <vector>
20 namespace kernel_selector {
22 static const size_t sub_group_size = 16;
23 static const size_t feature_block_size = 16;
25 ConvolutionKernel_bfyx_f16::ConvolutionKernel_bfyx_f16() : ConvolutionKernelBase("convolution_gpu_bfyx_f16") {
26 std::vector<size_t> outputBlockWidths = {2, 4, 8};
27 std::vector<std::string> executionModes = ConvolutionKernelBase::autoTuneOptions;
29 for (auto w : outputBlockWidths) {
30 for (auto exeMode : executionModes) {
31 autoTuneOptions.emplace_back(AutoTuneOption{w, exeMode});
36 ConvolutionKernel_bfyx_f16::AutoTuneOption ConvolutionKernel_bfyx_f16::GetAutoTuneOptions(const Params& params,
37 int autoTuneIndex) const {
38 if (autoTuneIndex >= 0 && autoTuneIndex < static_cast<int>(autoTuneOptions.size()))
39 return autoTuneOptions[autoTuneIndex];
41 const convolution_params& cp = static_cast<const convolution_params&>(params);
43 if (cp.output.X().v > 4)
49 ParamsKey ConvolutionKernel_bfyx_f16::GetSupportedKey() const {
51 k.EnableInputDataType(Datatype::F16);
52 k.EnableOutputDataType(Datatype::F16);
53 k.EnableInputWeightsType(WeightsType::F16);
54 k.EnableInputDataType(Datatype::F32);
55 k.EnableOutputDataType(Datatype::F32);
56 k.EnableInputWeightsType(WeightsType::F32);
57 k.EnableInputLayout(DataLayout::bfyx_f16);
58 k.EnableOutputLayout(DataLayout::bfyx_f16);
59 k.EnableTensorOffset();
60 k.EnableTensorPitches();
62 k.EnableBiasPerFeature();
63 // TODO Add bias per output support to kernel
64 // k.EnableBiasPerOutput();
65 k.EnableNonBiasTerm();
66 k.EnableSplitSupport();
68 k.EnableDepthwiseSeparableOpt();
70 k.EnableSubGroupShort();
74 ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_f16::SetDefault(const convolution_params& params,
75 int autoTuneIndex) const {
76 DispatchData kd = ConvolutionKernelBase::SetDefault(params);
78 const auto& out = params.output;
80 auto autoTune = GetAutoTuneOptions(params, autoTuneIndex);
81 kd.cldnnStyle.blockWidth = autoTune.blockWidth;
85 auto f = out.Feature().v;
86 auto b = out.Batch().v;
88 kd.gws0 = CeilDiv(x, autoTune.blockWidth) * y;
89 kd.gws1 = Align(f, sub_group_size);
93 kd.lws1 = sub_group_size;
97 kd.effiency = FORCE_PRIORITY_2;
99 kd.effiency = FORCE_PRIORITY_7;
104 bool ConvolutionKernel_bfyx_f16::Validate(const Params& p, const optional_params& o) const {
105 if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) {
109 const auto& params = static_cast<const convolution_params&>(p);
111 const auto& input = params.inputs[0];
112 const auto& output = params.output;
114 // Check that padding before features doesn't miss-align the blocks
115 if (input.Feature().pad.before % feature_block_size != 0 || output.Feature().pad.before % feature_block_size != 0)
118 if (params.groups != 1 || params.split != 1)
124 JitConstants ConvolutionKernel_bfyx_f16::GetJitConstants(const convolution_params& params,
125 const DispatchData& runInfo) const {
126 auto input = params.inputs[0];
127 auto output = params.output;
128 auto jit = Parent::GetJitConstants(params, runInfo);
130 auto blockWidth = runInfo.cldnnStyle.blockWidth;
131 if (params.fused_ops.size() > 0) {
132 FusedOpsConfiguration conf_vec = {"_VEC", {"b", "(f_block*16)", "y", "x"}, "dst", blockWidth, true, false, true, false };
133 FusedOpsConfiguration conf_scalar = {"_SCALAR", {"b", "(f_block*16)", "y", "(x+i)"}, "dst[i]", 1, true, false, true, false };
134 jit.Merge(MakeFusedOpsJitConstants(params, {conf_vec, conf_scalar}));
135 jit.Merge(MakeTypeJitConstants(Datatype::F32, "float"));
136 jit.Merge(MakeTypeJitConstants(Datatype::F16, "half"));
139 size_t input_line_size = std::min(params.stride.x * (blockWidth - 1) + (params.weights.X().v - 1)*params.dilation.x + 1,
140 input.X().v + input.X().pad.Total());
142 jit.AddConstant(MakeJitConstant("OUTPUT_X_BLOCK_SIZE", blockWidth));
143 jit.AddConstant(MakeJitConstant("INPUT_LINE_SIZE", input_line_size));
144 jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
145 jit.AddConstant(MakeJitConstant("X_BLOCKS", CeilDiv(output.X().v, blockWidth)));
146 jit.AddConstant(MakeJitConstant("IC_BLOCKS", CeilDiv(input.Feature().v, feature_block_size)));
147 if (params.output.Feature().v % feature_block_size != 0) {
148 jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", 1));
150 if (params.inputs[0].Feature().v % feature_block_size != 0) {
151 jit.AddConstant(MakeJitConstant("INPUT_LEFTOVERS", 1));
157 KernelsData ConvolutionKernel_bfyx_f16::GetTunedKernelsDataByIndex(const Params& params,
158 const optional_params& options,
159 const int autoTuneIndex) const {
160 auto tuneOptions = GetAutoTuneOptions(params, autoTuneIndex);
161 return GetCommonKernelsData(params, options, tuneOptions.exeMode, autoTuneIndex);
164 KernelsData ConvolutionKernel_bfyx_f16::GetKernelsData(const Params& params, const optional_params& options) const {
165 return GetTunedKernelsDataByIndex(params, options);
168 KernelsData ConvolutionKernel_bfyx_f16::GetKernelsDataForAutoTune(const Params& params,
169 const optional_params& options) const {
170 if (!Validate(params, options)) {
174 KernelsData res = {};
176 for (size_t i = 0; i < autoTuneOptions.size(); i++) {
177 KernelsData kd = GetTunedKernelsDataByIndex(params, options, static_cast<int>(i));
179 res.emplace_back(kd[0]);
186 } // namespace kernel_selector