inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_f16.cpp
// Copyright (c) 2016-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "convolution_kernel_bfyx_f16.h"
#include "kernel_selector_utils.h"
#include <algorithm>
#include <cctype>
#include <stdexcept>
#include <string>
#include <vector>

namespace kernel_selector {

static const size_t sub_group_size = 16;
static const size_t feature_block_size = 16;

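// The constructor enumerates the auto-tune search space: every output block
// width in {2, 4, 8} crossed with every execution mode inherited from the
// base class. GetAutoTuneOptions() indexes into this table during tuning.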
ConvolutionKernel_bfyx_f16::ConvolutionKernel_bfyx_f16() : ConvolutionKernelBase("convolution_gpu_bfyx_f16") {
    std::vector<size_t> outputBlockWidths = {2, 4, 8};
    std::vector<std::string> executionModes = ConvolutionKernelBase::autoTuneOptions;

    for (auto w : outputBlockWidths) {
        for (auto exeMode : executionModes) {
            autoTuneOptions.emplace_back(AutoTuneOption{w, exeMode});
        }
    }
}

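// Out-of-range tuning indices fall back to a simple heuristic: outputs wider
// than 4 pixels use an 8-pixel block per work item, narrower outputs use 2.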
ConvolutionKernel_bfyx_f16::AutoTuneOption ConvolutionKernel_bfyx_f16::GetAutoTuneOptions(const Params& params,
                                                                                          int autoTuneIndex) const {
    if (autoTuneIndex >= 0 && autoTuneIndex < static_cast<int>(autoTuneOptions.size()))
        return autoTuneOptions[autoTuneIndex];

    const convolution_params& cp = static_cast<const convolution_params&>(params);

    if (cp.output.X().v > 4)
        return {8, DEFAULT};
    else
        return {2, DEFAULT};
}

ParamsKey ConvolutionKernel_bfyx_f16::GetSupportedKey() const {
    ParamsKey k;
    k.EnableInputDataType(Datatype::F16);
    k.EnableOutputDataType(Datatype::F16);
    k.EnableInputWeightsType(WeightsType::F16);
    k.EnableInputDataType(Datatype::F32);
    k.EnableOutputDataType(Datatype::F32);
    k.EnableInputWeightsType(WeightsType::F32);
    k.EnableInputLayout(DataLayout::bfyx_f16);
    k.EnableOutputLayout(DataLayout::bfyx_f16);
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableDilation();
    k.EnableBiasPerFeature();
    // TODO Add bias per output support to kernel
    // k.EnableBiasPerOutput();
    k.EnableNonBiasTerm();
    k.EnableSplitSupport();
    k.EnableBatching();
    k.EnableDepthwiseSeparableOpt();
    k.EnableSubGroup();
    k.EnableSubGroupShort();
    return k;
}

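// Dispatch geometry: gws0 iterates X blocks times Y rows, gws1 pads the
// feature dimension up to the 16-wide sub-group, and gws2 covers the batch.
// Single-batch cases are given a higher scheduling priority.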
ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_f16::SetDefault(const convolution_params& params,
                                                                           int autoTuneIndex) const {
    DispatchData kd = ConvolutionKernelBase::SetDefault(params);

    const auto& out = params.output;

    auto autoTune = GetAutoTuneOptions(params, autoTuneIndex);
    kd.cldnnStyle.blockWidth = autoTune.blockWidth;

    auto x = out.X().v;
    auto y = out.Y().v;
    auto f = out.Feature().v;
    auto b = out.Batch().v;

    kd.gws0 = CeilDiv(x, autoTune.blockWidth) * y;
    kd.gws1 = Align(f, sub_group_size);
    kd.gws2 = b;

    kd.lws0 = 1;
    kd.lws1 = sub_group_size;
    kd.lws2 = 1;

    if (b == 1)
        kd.effiency = FORCE_PRIORITY_2;
    else
        kd.effiency = FORCE_PRIORITY_7;

    return kd;
}

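// Rejects configurations this kernel cannot handle: feature padding must keep
// the 16-feature blocks aligned, and grouped or split convolutions are not
// supported.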
bool ConvolutionKernel_bfyx_f16::Validate(const Params& p, const optional_params& o) const {
    if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) {
        return false;
    }

    const auto& params = static_cast<const convolution_params&>(p);

    const auto& input = params.inputs[0];
    const auto& output = params.output;

    // Check that the padding before features doesn't misalign the blocks
    if (input.Feature().pad.before % feature_block_size != 0 || output.Feature().pad.before % feature_block_size != 0)
        return false;

    if (params.groups != 1 || params.split != 1)
        return false;

    return true;
}

JitConstants ConvolutionKernel_bfyx_f16::GetJitConstants(const convolution_params& params,
                                                         const DispatchData& runInfo) const {
    auto input = params.inputs[0];
    auto output = params.output;
    auto jit = Parent::GetJitConstants(params, runInfo);

    auto blockWidth = runInfo.cldnnStyle.blockWidth;

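    // Number of input pixels a single work item must read to produce
    // blockWidth outputs: the stride spans the gaps between adjacent outputs
    // and the dilated filter extent covers the last one, clamped to the
    // padded input width.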
    size_t input_line_size = std::min(params.stride.x * (blockWidth - 1) + (params.weights.X().v - 1) * params.dilation.x + 1,
                                      input.X().v + input.X().pad.Total());

    jit.AddConstant(MakeJitConstant("OUTPUT_X_BLOCK_SIZE", blockWidth));
    jit.AddConstant(MakeJitConstant("INPUT_LINE_SIZE", input_line_size));
    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
    jit.AddConstant(MakeJitConstant("X_BLOCKS", CeilDiv(output.X().v, blockWidth)));
    jit.AddConstant(MakeJitConstant("IC_BLOCKS", CeilDiv(input.Feature().v, feature_block_size)));
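    // If a feature count is not a multiple of the 16-feature block, the
    // kernel needs guarded tail handling on the corresponding side.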
    if (params.output.Feature().v % feature_block_size != 0) {
        jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", 1));
    }
    if (params.inputs[0].Feature().v % feature_block_size != 0) {
        jit.AddConstant(MakeJitConstant("INPUT_LEFTOVERS", 1));
    }

    return jit;
}

KernelsData ConvolutionKernel_bfyx_f16::GetTunedKernelsDataByIndex(const Params& params,
                                                                   const optional_params& options,
                                                                   const int autoTuneIndex) const {
    auto tuneOptions = GetAutoTuneOptions(params, autoTuneIndex);
    return GetCommonKernelsData(params, options, tuneOptions.exeMode, autoTuneIndex);
}

KernelsData ConvolutionKernel_bfyx_f16::GetKernelsData(const Params& params, const optional_params& options) const {
    return GetTunedKernelsDataByIndex(params, options);
}

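// For auto-tuning, emit one kernel variant per entry in the tuning table so
// the runtime can profile each candidate and keep the fastest.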
KernelsData ConvolutionKernel_bfyx_f16::GetKernelsDataForAutoTune(const Params& params,
                                                                  const optional_params& options) const {
    if (!Validate(params, options)) {
        return {};
    }

    KernelsData res = {};

    for (size_t i = 0; i < autoTuneOptions.size(); i++) {
        KernelsData kd = GetTunedKernelsDataByIndex(params, options, static_cast<int>(i));
        if (!kd.empty()) {
            res.emplace_back(kd[0]);
        }
    }

    return res;
}

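// Builds the JIT macro strings that let the generated OpenCL kernel fuse
// elementwise operations (and their activations) directly into the
// convolution epilogue instead of running them as separate kernels.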
JitConstants ConvolutionKernel_bfyx_f16::GetFusedPrimitivesJitConstants(const convolution_params& params,
                                                                        const DispatchData& kd) const {
    JitConstants jit = {};

    size_t op_id = 0;
    std::string input_decls = "";
    std::string load_decls_vec = "";
    std::string load_decls = "";
    std::string eltwise_fused_ops_vec = "";
    std::string eltwise_fused_ops = "";

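    // Renders "<TENSOR>_TYPE" or "MAKE_VECTOR_TYPE(<TENSOR>_TYPE,N)" for the
    // requested vector width.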
    auto make_jit_vector_type = [](const std::string& tensor_name, size_t vec_size) -> std::string {
        if (vec_size == 0 || vec_size > 8)
            throw std::invalid_argument("Invalid vector size in jit definitions");
        if (vec_size > 1)
            return "MAKE_VECTOR_TYPE(" + tensor_name + "_TYPE," + std::to_string(vec_size) + ")";
        else
            return tensor_name + "_TYPE";
    };

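    // Renders a sub-group block read of the fused tensor: a vectorized
    // UNIT_BLOCK_READ<N> for the blocked path, or a scalar read indexed by
    // the per-pixel loop variable i for the tail path.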
    auto make_jit_load = [](const std::string& tensor_name, const std::string& ptr_name, size_t vec_size) -> std::string {
        if (vec_size == 0 || vec_size > 8)
            throw std::invalid_argument("Invalid vector size in jit definitions");

        std::string index_func_call_vec = tensor_name + "_GET_INDEX(b, f_block*16, y, x)";
        std::string index_func_call = tensor_name + "_GET_INDEX(b, f_block*16, y, x+i)";
        if (vec_size > 1)
            return " UNIT_BLOCK_READ" + std::to_string(vec_size) + "(" + ptr_name + ", " + index_func_call_vec + ")";
        else
            return " UNIT_BLOCK_READ(" + ptr_name + ", " + index_func_call + ")";
    };

    for (auto& fused_dep : params.fused_ops) {
        std::string op_type = "";
        switch (fused_dep.type) {
            case convolution_params::fused_operation_desc::Type::ELTWISE: {
                op_type = "eltwise";
                eltwise_fused_ops_vec += "dst = (dst + " + op_type + "_data);";
                eltwise_fused_ops += "dst[i] = (dst[i] + " + op_type + "_data);";
                break;
            }
            default:
                throw std::invalid_argument("Invalid fused op in convolution_gpu_bfyx_f16 kernel: " + params.layerID);
        }

        for (size_t op_input_id = 0; op_input_id < fused_dep.tensors.size(); op_input_id++) {
            std::string name = "FUSED_OP_" + std::to_string(op_id) + "_INPUT" + std::to_string(op_input_id);
            std::string ptr_name = op_type + "_input" + std::to_string(op_input_id);

            std::string var_name = op_type + "_data";
            jit.AddConstant(MakeJitConstant(name, fused_dep.tensors[op_input_id]));
            input_decls += "const __global " + toCLType(fused_dep.tensors[op_input_id].GetDType()) +
                           "* " + ptr_name + ",";
            load_decls_vec += make_jit_vector_type(name, kd.cldnnStyle.blockWidth) + " " + var_name + " = " +
                              make_jit_load(name, ptr_name, kd.cldnnStyle.blockWidth) + ";";
            load_decls += make_jit_vector_type(name, 1) + " " + var_name + " = " +
                          make_jit_load(name, ptr_name, 1) + ";";
        }

        if (fused_dep.activation.function != ActivationFunction::NONE) {
            std::string temp_op_type = op_type;
            for (auto& ch : temp_op_type)
                ch = static_cast<char>(std::toupper(static_cast<unsigned char>(ch)));
            std::string suffix = "_" + temp_op_type;

            jit.Merge(MakeActivationJitConstants(fused_dep.activation, suffix));
            eltwise_fused_ops_vec += "dst = ACTIVATION" + suffix + "(dst, ACTIVATION_PARAMS" + suffix + ");";
            eltwise_fused_ops += "dst[i] = ACTIVATION" + suffix + "(dst[i], ACTIVATION_PARAMS" + suffix + ");";
        }
        op_id++;
    }
    jit.AddConstant(MakeJitConstant("FUSED_OPS_DECLS", input_decls));
    jit.AddConstant(MakeJitConstant("FUSED_OPS_LOAD_DATA_VEC", load_decls_vec));
    jit.AddConstant(MakeJitConstant("FUSED_OPS_LOAD_DATA", load_decls));
    jit.AddConstant(MakeJitConstant("DO_ELTWISE_FUSED_OPS_VEC", eltwise_fused_ops_vec));
    jit.AddConstant(MakeJitConstant("DO_ELTWISE_FUSED_OPS", eltwise_fused_ops));

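    // Illustration (a sketch, assuming a single fused FP16 eltwise input with
    // blockWidth == 8): the macros above expand in the OpenCL kernel roughly to
    //   FUSED_OPS_DECLS          -> const __global half* eltwise_input0,
    //   FUSED_OPS_LOAD_DATA_VEC  -> MAKE_VECTOR_TYPE(FUSED_OP_0_INPUT0_TYPE,8) eltwise_data =
    //                               UNIT_BLOCK_READ8(eltwise_input0, FUSED_OP_0_INPUT0_GET_INDEX(b, f_block*16, y, x));
    //   DO_ELTWISE_FUSED_OPS_VEC -> dst = (dst + eltwise_data);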
    return jit;
}

}  // namespace kernel_selector