inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_f16.cpp
// Copyright (c) 2016-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "convolution_kernel_bfyx_f16.h"
#include "kernel_selector_utils.h"
#include <vector>
#include <algorithm>

namespace kernel_selector {

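// The kernel operates on sub-groups of 16 work items, and the bfyx_f16 layout
// blocks the feature dimension in groups of the same size.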
static const size_t sub_group_size = 16;
static const size_t feature_block_size = 16;

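// Build the auto-tune search space as the cross product of the supported
// output block widths and the base class's execution modes.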
ConvolutionKernel_bfyx_f16::ConvolutionKernel_bfyx_f16() : ConvolutionKernelBase("convolution_gpu_bfyx_f16") {
    std::vector<size_t> outputBlockWidths = {2, 4, 8};
    std::vector<std::string> executionModes = ConvolutionKernelBase::autoTuneOptions;

    for (auto w : outputBlockWidths) {
        for (auto exeMode : executionModes) {
            autoTuneOptions.emplace_back(AutoTuneOption{w, exeMode});
        }
    }
}

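// Return the option at autoTuneIndex when the index is valid; otherwise pick a
// block width heuristically from the output width.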
ConvolutionKernel_bfyx_f16::AutoTuneOption ConvolutionKernel_bfyx_f16::GetAutoTuneOptions(const Params& params,
                                                                                          int autoTuneIndex) const {
    if (autoTuneIndex >= 0 && autoTuneIndex < static_cast<int>(autoTuneOptions.size()))
        return autoTuneOptions[autoTuneIndex];

    const convolution_params& cp = static_cast<const convolution_params&>(params);

    if (cp.output.X().v > 4)
        return {8, DEFAULT};
    else
        return {2, DEFAULT};
}

ParamsKey ConvolutionKernel_bfyx_f16::GetSupportedKey() const {
    ParamsKey k;
    k.EnableInputDataType(Datatype::F16);
    k.EnableOutputDataType(Datatype::F16);
    k.EnableInputWeightsType(WeightsType::F16);
    k.EnableInputDataType(Datatype::F32);
    k.EnableOutputDataType(Datatype::F32);
    k.EnableInputWeightsType(WeightsType::F32);
    k.EnableInputLayout(DataLayout::bfyx_f16);
    k.EnableOutputLayout(DataLayout::bfyx_f16);
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableDilation();
    k.EnableBiasPerFeature();
    // TODO Add bias per output support to kernel
    // k.EnableBiasPerOutput();
    k.EnableNonBiasTerm();
    k.EnableSplitSupport();
    k.EnableBatching();
    k.EnableDepthwiseSeparableOpt();
    k.EnableSubGroup();
    k.EnableSubGroupShort();
    return k;
}

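// Dispatch: gws0 iterates the x blocks of every output row, gws1 is the
// feature count aligned up to the sub-group size, and gws2 is the batch.
// Each work-group is one sub-group of 16 work items on the feature axis.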
ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_f16::SetDefault(const convolution_params& params,
                                                                           int autoTuneIndex) const {
    DispatchData kd = ConvolutionKernelBase::SetDefault(params);

    const auto& out = params.output;

    auto autoTune = GetAutoTuneOptions(params, autoTuneIndex);
    kd.cldnnStyle.blockWidth = autoTune.blockWidth;

    auto x = out.X().v;
    auto y = out.Y().v;
    auto f = out.Feature().v;
    auto b = out.Batch().v;

    kd.gws0 = CeilDiv(x, autoTune.blockWidth) * y;
    kd.gws1 = Align(f, sub_group_size);
    kd.gws2 = b;

    kd.lws0 = 1;
    kd.lws1 = sub_group_size;
    kd.lws2 = 1;

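    // Lower FORCE_PRIORITY values are preferred, so the single-batch case is
    // ranked higher. Note that "effiency" is the actual (misspelled) field
    // name in DispatchData and must stay as written.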
    if (b == 1)
        kd.effiency = FORCE_PRIORITY_2;
    else
        kd.effiency = FORCE_PRIORITY_7;

    return kd;
}

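// Reject configurations this kernel cannot handle: feature padding that is not
// a multiple of the 16-wide feature block, and grouped or split convolutions.
// (CovolutionCheckInput is spelled this way in the base class.)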
bool ConvolutionKernel_bfyx_f16::Validate(const Params& p, const optional_params& o) const {
    if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) {
        return false;
    }

    const auto& params = static_cast<const convolution_params&>(p);

    const auto& input = params.inputs[0];
    const auto& output = params.output;

    // Check that padding before features doesn't misalign the blocks
    if (input.Feature().pad.before % feature_block_size != 0 || output.Feature().pad.before % feature_block_size != 0)
        return false;

    if (params.groups != 1 || params.split != 1)
        return false;

    return true;
}

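// Kernel-specific JIT defines. Fused operations get two configurations: a
// vectorized one that processes a whole output block and a scalar per-element
// fallback.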
JitConstants ConvolutionKernel_bfyx_f16::GetJitConstants(const convolution_params& params,
                                                         const DispatchData& runInfo) const {
    auto input = params.inputs[0];
    auto output = params.output;
    auto jit = Parent::GetJitConstants(params, runInfo);

    auto blockWidth = runInfo.cldnnStyle.blockWidth;
    if (!params.fused_ops.empty()) {
        FusedOpsConfiguration conf_vec = {"_VEC", {"b", "(f_block*16)", "y", "x"}, "dst", blockWidth, true, false, true, false };
        FusedOpsConfiguration conf_scalar = {"_SCALAR", {"b", "(f_block*16)", "y", "(x+i)"}, "dst[i]", 1, true, false, true, false };
        jit.Merge(MakeFusedOpsJitConstants(params, {conf_vec, conf_scalar}));
        jit.Merge(MakeTypeJitConstants(Datatype::F32, "float"));
        jit.Merge(MakeTypeJitConstants(Datatype::F16, "half"));
    }

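    // Input pixels needed per row for one output block: the stride across the
    // block plus the dilated filter footprint, clamped to the padded input width.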
    size_t input_line_size = std::min(params.stride.x * (blockWidth - 1) + (params.weights.X().v - 1) * params.dilation.x + 1,
                                      input.X().v + input.X().pad.Total());

    jit.AddConstant(MakeJitConstant("OUTPUT_X_BLOCK_SIZE", blockWidth));
    jit.AddConstant(MakeJitConstant("INPUT_LINE_SIZE", input_line_size));
    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
    jit.AddConstant(MakeJitConstant("X_BLOCKS", CeilDiv(output.X().v, blockWidth)));
    jit.AddConstant(MakeJitConstant("IC_BLOCKS", CeilDiv(input.Feature().v, feature_block_size)));
    if (params.output.Feature().v % feature_block_size != 0) {
        jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", 1));
    }
    if (params.inputs[0].Feature().v % feature_block_size != 0) {
        jit.AddConstant(MakeJitConstant("INPUT_LEFTOVERS", 1));
    }

    return jit;
}

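// Build kernel data for a single auto-tune option, selected by index.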
KernelsData ConvolutionKernel_bfyx_f16::GetTunedKernelsDataByIndex(const Params& params,
                                                                   const optional_params& options,
                                                                   const int autoTuneIndex) const {
    auto tuneOptions = GetAutoTuneOptions(params, autoTuneIndex);
    return GetCommonKernelsData(params, options, tuneOptions.exeMode, autoTuneIndex);
}

KernelsData ConvolutionKernel_bfyx_f16::GetKernelsData(const Params& params, const optional_params& options) const {
    return GetTunedKernelsDataByIndex(params, options);
}

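// Enumerate the whole auto-tune search space, returning one candidate kernel
// per option; invalid parameters yield an empty list.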
KernelsData ConvolutionKernel_bfyx_f16::GetKernelsDataForAutoTune(const Params& params,
                                                                  const optional_params& options) const {
    if (!Validate(params, options)) {
        return {};
    }

    KernelsData res = {};

    for (size_t i = 0; i < autoTuneOptions.size(); i++) {
        KernelsData kd = GetTunedKernelsDataByIndex(params, options, static_cast<int>(i));
        if (!kd.empty()) {
            res.emplace_back(kd[0]);
        }
    }

    return res;
}

}  // namespace kernel_selector