inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.cpp
/*
// Copyright (c) 2016 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/

#include "fused_conv_eltwise_kernel_base.h"
#include "kernel_selector_utils.h"
#include "common_tools.h"

namespace kernel_selector
{
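    // Serializes the fused conv+eltwise parameters into a readable key string: the
    // base_params string followed by bias presence/size, filter size, stride,
    // dilation, padding and split (useful, e.g., for logging and telling kernel
    // configurations apart).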
    std::string fused_conv_eltwise_params::to_string() const
    {
        std::stringstream s;

        s << base_params::to_string() << "_";
        if (bias.empty())
        {
            s << "no_bias" << "_";
        }
        else
        {
            s << "bias_" << bias[0].PhysicalSize() << "_";
        }

        s << conv.filterSize.x << "_" << conv.filterSize.y << "_";
        s << conv.stride.x << "_" << conv.stride.y << "_";
        s << conv.dilation.x << "_" << conv.dilation.y << "_";
        s << conv.padding.x << "_" << conv.padding.y << "_";
        s << conv.split;

        return s.str();
    }

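    // Translates the parameter set into a ParamsKey so the kernel selector can match
    // it against kernels that declare support for the corresponding features: split,
    // dilation, depthwise-separable optimization, transposed filters, int8
    // quantization, output calibration, local convolution, and the read/write-output
    // optimization used when the eltwise second input lives in the output buffer.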
    ParamsKey fused_conv_eltwise_params::GetParamsKey() const
    {
        ParamsKey k = weight_bias_params::GetParamsKey();

        if (conv.split > 1)
        {
            k.EnableFusedConvEltwSplitSupport();
        }

        if (conv.dilation.x != 1 ||
            conv.dilation.y != 1)
        {
            k.EnableFusedConvEltwDilation();
        }

        if (conv.depthwise_separable_opt)
        {
            k.EnableFusedConvEltwDepthwiseSeparableOpt();
        }

        if (conv.transposed)
        {
            k.EnableFusedConvEltwTranspose();
        }

        if (conv.int8_quantization)
        {
            k.EnableFusedConvEltwInt8Quantization();
        }

        if (conv.output_calibration)
        {
            k.EnableFusedConvEltwOutputCalibration();
        }

        if (conv.local_convolution)
        {
            k.EnableFusedConvEltwLocalConvolution();
        }

        if (second_input_in_output)
        {
            k.EnableFusedConvEltwiseRWOutOpt();
        }

        return k;
    }

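    // Rejects parameter/option objects of the wrong kernel type, then requires the
    // weights layout to be either directly supported by the derived kernel or
    // convertible via a static weights reorder (allowStaticInputReordering).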
    bool fused_conv_eltwise_kernel_base::Validate(const Params& p, const optional_params& o) const
    {
        if (p.GetType() != KernelType::FUSED_CONV_ELTWISE ||
            o.GetType() != KernelType::FUSED_CONV_ELTWISE)
        {
            return false;
        }

        const fused_conv_eltwise_params& params = static_cast<const fused_conv_eltwise_params&>(p);
        const fused_conv_eltwise_optional_params& optParams = static_cast<const fused_conv_eltwise_optional_params&>(o);

        bool bSupportedWeightsLayout = false;

        for (WeightsLayout l : GetSupportedWeightLayouts(params))
        {
            bSupportedWeightsLayout |= params.weights.GetLayout() == l;
        }

        const bool bWeightsOK = bSupportedWeightsLayout || optParams.allowStaticInputReordering;

        if (!bWeightsOK)
        {
            return false;
        }

        return true;
    }

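    // Emits the JIT constants shared by the fused conv+eltwise kernels: convolution
    // geometry (STRIDE, PADDING, DILATION), the split count, the input offset adjusted
    // for padding, optional int8 quantization/calibration factors, activation macros
    // for the eltwise stage (suffixed "_ELTW"), the IN_OUT_OPT flag, and loop-unroll
    // helpers sized to the largest of the filter and GEMM-style dimensions (for
    // example, with a hypothetical 3x3 filter and the default GEMM-style values from
    // SetDefault, loopCount evaluates to 3).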
    JitConstants fused_conv_eltwise_kernel_base::GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& kd) const
    {
        JitConstants mem_consts = WeightBiasKernelBase::GetJitConstants(params);
        const auto& padding = params.conv.padding;
        const auto& input = params.inputs[0];

        int64_t input_offset_with_padding = (int64_t)input.GetFirstElementOffset() - padding.x*input.X().pitch - input.Y().pitch*padding.y;
        input_offset_with_padding = std::max(input_offset_with_padding, (int64_t)0);

        mem_consts.AddConstants({
            MakeJitConstant("STRIDE",                       params.conv.stride),
            MakeJitConstant("PADDING",                      params.conv.padding),
            MakeJitConstant("DILATION",                     params.conv.dilation),
            MakeJitConstant("FILTER_ARRAY_NUM",             params.conv.split),
            MakeJitConstant("INPUT0_OFFSET_WITH_PADDING",   input_offset_with_padding),
            MakeJitConstant("DEPTHWISE_SEPARABLE_OPT",      params.conv.depthwise_separable_opt),
            MakeJitConstant("QUANTIZATION_TERM",            params.conv.int8_quantization),
        });

        if (params.conv.int8_quantization)
        {
            mem_consts.AddConstants({ MakeJitConstant("W_QF", params.conv.weights_quantization_factors[0]) });
            mem_consts.AddConstants({ MakeJitConstant("I_QF", params.conv.input_quantization_factor) });

            if (params.conv.output_calibration)
            {
                mem_consts.AddConstant(MakeJitConstant("CALIBRATION_TERM", params.conv.output_calibration));
                mem_consts.AddConstant(MakeJitConstant("O_QF", params.conv.output_calibration_factors[0]));
            }
            else
            {
                mem_consts.AddConstants({ MakeJitConstant("O_QF", params.conv.output_quantization_factor) });
            }
        }

        if (params.conv.local_convolution)
        {
            mem_consts.AddConstants({ MakeJitConstant("LOCAL_CONVOLUTION", params.conv.local_convolution) });
        }

        JitConstants eltw_activations = MakeActivationJitConstants(params.eltw.activation, "_ELTW");
        mem_consts.Merge(eltw_activations);

        mem_consts.AddConstant(MakeJitConstant("IN_OUT_OPT", params.second_input_in_output ? 1 : 0));

        std::vector<uint32_t> unrollLoopParams{
            params.conv.filterSize.x,
            params.conv.filterSize.y,
            (uint32_t)kd.gemmStyle.globalWorkSizeDX,
            (uint32_t)kd.gemmStyle.globalWorkSizeDY,
            (uint32_t)kd.gemmStyle.globalWorkSizeDZ,
            (uint32_t)kd.gemmStyle.subBlockDimM,
            (uint32_t)kd.gemmStyle.subBlockDimK,
            (uint32_t)kd.gemmStyle.subBlockDimN
        };

        auto loopCount = *std::max_element(unrollLoopParams.begin(), unrollLoopParams.end());

        JitConstants mem_consts_loop = MakeLoopUnrollParamsJitConstants(loopCount);
        mem_consts.Merge(mem_consts_loop);

        return mem_consts;
    }

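    // Sanity-checks the dispatch data: all global/local work-group sizes must be
    // non-zero and every global size must be an integer multiple of the matching local
    // size, as OpenCL requires for uniform work-group dispatches.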
    bool fused_conv_eltwise_kernel_base::CheckWorkGroups(const fused_conv_eltwise_kernel_base::DispatchData& kd)
    {
        if (kd.gws0 == 0 ||
            kd.gws1 == 0 ||
            kd.gws2 == 0 ||
            kd.lws0 == 0 ||
            kd.lws1 == 0 ||
            kd.lws2 == 0)
        {
            return false;
        }

        if ((kd.gws0 % kd.lws0) != 0 ||
            (kd.gws1 % kd.lws1) != 0 ||
            (kd.gws2 % kd.lws2) != 0)
        {
            return false;
        }

        return true;
    }

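    // Internal helper: decides whether a tensor whose pitches differ from its logical
    // dimensions can still be treated as contiguous for the given split factor, by
    // virtually growing the feature dimension by 'split' and checking that the
    // resulting tensor would have plain, unpadded pitches.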
    namespace
    {
        bool CheckTensorForSplit(const DataTensor& t, uint32_t split)
        {
            if (t.PitchesDifferFromLogicalDims())
            {
                auto feature = t.Feature();
                auto featureIndex = DataTensor::Channelndex(t.GetLayout(), Tensor::DataChannelName::FEATURE);
                if (featureIndex >= 0 && featureIndex+1 < (int)DataTensor::ChannelsCount(t.GetLayout()))
                {
                    if (feature.v*split <= t.GetDims()[featureIndex+1].pitch)
                    {
                        Tensor::NDims newDims = t.GetDims();
                        newDims[featureIndex].v = feature.v*split;

                        DataTensor newTensor{ newDims, t.GetDType(), t.GetLayout(), t.GetViewOffset(), t.PhysicalSize(), t.GetPaddedVal() };

                        if (newTensor.PitchesDifferFromLogicalDims() == false)
                        {
                            return true;
                        }
                    }
                }

                return false;
            }

            return true;
        }
    }

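    // Split-only kernels do not handle arbitrary pitches/offsets, so restrict them to
    // inputs that CheckTensorForSplit accepts.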
    bool fused_conv_eltwise_kernel_base::CheckPitchForSplitOnly(const fused_conv_eltwise_params& params)
    {
        // TODO: it's better to add pitch+offset support than handle this case
        return CheckTensorForSplit(params.inputs[0], params.conv.split);
    }

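    // Fills a baseline DispatchData: the global work size is (X, Y, F*B) for bfyx/byxf
    // outputs and (F*B, X, Y) otherwise, with local sizes picked by
    // GetOptimalLocalWorkGroupSizes; block and GEMM-style tuning fields are reset to
    // neutral values for derived kernels to override. ('effiency' is the existing
    // spelling of the efficiency field in DispatchData.)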
    fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_base::SetDefault(const fused_conv_eltwise_params& params, int) const
    {
        DispatchData kd;

        const auto& out = params.output;
        kd.fp16UnitUsed = out.GetDType() == Datatype::F16;
        std::vector<size_t> global;
        if (params.output.GetLayout() == DataLayout::bfyx || params.output.GetLayout() == DataLayout::byxf)
        {
            global = { out.X().v, out.Y().v, out.Feature().v*out.Batch().v };
        }
        else
        {
            global = { out.Feature().v*out.Batch().v, out.X().v, out.Y().v };
        }

        auto local = GetOptimalLocalWorkGroupSizes(global);

        kd.gws0 = global[0];
        kd.gws1 = global[1];
        kd.gws2 = global[2];

        kd.lws0 = local[0];
        kd.lws1 = local[1];
        kd.lws2 = local[2];

        kd.cldnnStyle.blockWidth = 1;
        kd.cldnnStyle.blockHeight = 1;
        kd.cldnnStyle.prefetch = 0;
        kd.cldnnStyle.inputBlockArraySize = 0;
        kd.cldnnStyle.inputBlockWidth = 0;

        kd.gemmStyle.globalWorkSizeDX = 1;
        kd.gemmStyle.globalWorkSizeDY = 1;
        kd.gemmStyle.globalWorkSizeDZ = 1;
        kd.gemmStyle.subBlockDimK = 1;
        kd.gemmStyle.subBlockDimM = 0;
        kd.gemmStyle.subBlockDimN = 0;
        kd.effiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
        return kd;
    }

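    // Common construction path for the derived kernels: validate, optionally request
    // an input reorder for padded input, compute dispatch data, set up the weights
    // reorder, generate JIT constants and the entry point, then populate the CL kernel
    // arguments (split, the eltwise second input taken either from the output buffer
    // or from input 1, and optional eltwise output calibration factors).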
    KernelsData fused_conv_eltwise_kernel_base::GetCommonKernelsData(const Params& params, const optional_params& options, const std::string exeMode, int autoTuneIndex) const
    {
        if (!Validate(params, options))
        {
            return {};
        }

        KernelData kd = KernelData::Default<fused_conv_eltwise_params>(params);
        fused_conv_eltwise_params& newParams = *static_cast<fused_conv_eltwise_params*>(kd.params.get());

        if (NeedPaddedInput())
        {
            kd.reorderInput = CovolutionUpdateInputParams(newParams);
        }
        DispatchData runInfo = SetDefault(newParams, autoTuneIndex);

        if (!CheckWorkGroups(runInfo))
        {
            // Internal Error - wrong calculation of global/local work group sizes
            return {};
        }

        bool succeed = UpdateWeightsParams(
            newParams,
            options,
            GetSupportedWeightLayouts(newParams),
            kd.weightsReorderParams);

        if (!succeed)
        {
            return {};
        }

        auto finalKernelName = GetKernelName(newParams);
        auto cldnnJit = GetJitConstants(newParams, runInfo);
        auto entryPoint = GetEntryPoint(finalKernelName, newParams.layerID, options);
        auto jit = CreateJit(finalKernelName, cldnnJit, entryPoint);

        auto& kernel = kd.kernels[0];
        FillCLKernelData(kernel, runInfo, params.engineInfo, finalKernelName, jit, entryPoint, exeMode, true, !newParams.bias.empty(), 1, newParams.conv.int8_quantization, newParams.conv.output_calibration);
        kernel.arguments.push_back({ ArgumentDescriptor::Types::SPLIT, 0 });
        // eltwise's second input
        if (newParams.second_input_in_output)
        {
            kernel.arguments.push_back({ ArgumentDescriptor::Types::OUTPUT, 0 });
        }
        else
        {
            kernel.arguments.push_back({ ArgumentDescriptor::Types::INPUT, 1 });
        }
        if (!newParams.eltw.output_calibration_factors.empty())
        {
            kernel.arguments.push_back({ ArgumentDescriptor::Types::OUTPUT_CALIBRATION_FACTORS, 1 });
        }

        kd.estimatedTime = runInfo.effiency;
        kd.autoTuneIndex = autoTuneIndex;

        return { kd };
    }

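    // Maps an auto-tune index to one of the per-kernel execution-mode strings,
    // falling back to DEFAULT for out-of-range indices.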
    std::string fused_conv_eltwise_kernel_base::GetAutoTuneOptions(int autoTuneIndex) const
    {
        if ((autoTuneIndex >= 0) && (autoTuneIndex < (int)autoTuneOptions.size()))
        {
            return autoTuneOptions[autoTuneIndex];
        }

        return DEFAULT;
    }

    KernelsData fused_conv_eltwise_kernel_base::GetTunedKernelsDataByIndex(const Params& params, const optional_params& options, const int autoTuneIndex) const
    {
        return GetCommonKernelsData(params, options, GetAutoTuneOptions(autoTuneIndex), autoTuneIndex);
    }

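    // Produces one KernelsData entry per auto-tune option, presumably so the
    // auto-tuner can evaluate every variant and keep the best one.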
    KernelsData fused_conv_eltwise_kernel_base::GetKernelsDataForAutoTune(const Params& params, const optional_params& options) const
    {
        if (!Validate(params, options))
        {
            return {};
        }

        KernelsData res = {};

        for (size_t i = 0; i < autoTuneOptions.size(); i++)
        {
            KernelsData kd = GetTunedKernelsDataByIndex(params, options, (int)i);
            if (!kd.empty())
            {
                res.emplace_back(kd[0]);
            }
        }

        return res;
    }

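    // Builds the input descriptor (with padding) that the convolution requires. The
    // needed extent along X is (out_x - 1) * stride_x + (filter_x - 1) * dilation_x + 1,
    // and likewise for Y; any shortfall versus the actual input size becomes "after"
    // padding. Purely illustrative numbers: out_x = 56, stride_x = 1, filter_x = 3,
    // dilation_x = 1 gives an input limit of 58, so a 56-wide input with a before-pad
    // of 1 ends up with an after-pad of 1.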
    static DataTensor GetConvolutionBFYXPaddedTensor(const fused_conv_eltwise_params& cp)
    {
        assert(cp.inputs[0].GetDims().size() == 4U);

        DataTensor t = cp.inputs[0];
        std::vector<Tensor::Pad> pad{ { 0,0 },{ 0,0 },{ 0,0 },{ 0,0 } };

        auto& conv = cp.conv;

        pad[0].before = conv.padding.x;
        pad[1].before = conv.padding.y;

        const auto inputLimitX = (cp.output.X().v - 1) * conv.stride.x + (conv.filterSize.x - 1) * conv.dilation.x + 1;
        const auto inputLimitY = (cp.output.Y().v - 1) * conv.stride.y + (conv.filterSize.y - 1) * conv.dilation.y + 1;

        pad[0].after = (size_t)std::max((int)inputLimitX - (int)t.X().v - (int)pad[0].before, (int)0);
        pad[1].after = (size_t)std::max((int)inputLimitY - (int)t.Y().v - (int)pad[1].before, (int)0);

        Tensor::NDims dims(4);
        const Tensor::NDims& orgDims = cp.inputs[0].GetDims();
        size_t pitch = 1;
        for (size_t i = 0; i < dims.size(); i++)
        {
            dims[i].pad = pad[i];
            dims[i].v = orgDims[i].v;
            dims[i].pitch = pitch;
            pitch *= dims[i].LogicalDimPadded();
        }

        return { dims, t.GetDType(), t.GetLayout() };
    }

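    // Returns true when the existing input tensor already provides at least the
    // padding that reqDesc asks for on every dimension, and its padded value is zero
    // whenever the convolution uses non-zero padding.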
    bool CheckConvolutionPaddedInputDesc(const fused_conv_eltwise_params& params, const DataTensor& reqDesc)
    {
        bool properPadding =
            reqDesc.X().pad.before <= params.inputs[0].X().pad.before &&
            reqDesc.Y().pad.before <= params.inputs[0].Y().pad.before &&
            reqDesc.Feature().pad.before <= params.inputs[0].Feature().pad.before &&
            reqDesc.Batch().pad.before <= params.inputs[0].Batch().pad.before;

        properPadding &=
            reqDesc.X().pad.after <= params.inputs[0].X().pad.after &&
            reqDesc.Y().pad.after <= params.inputs[0].Y().pad.after &&
            reqDesc.Feature().pad.after <= params.inputs[0].Feature().pad.after &&
            reqDesc.Batch().pad.after <= params.inputs[0].Batch().pad.after;

        properPadding &= ((params.conv.padding.x == 0 && params.conv.padding.y == 0) || params.inputs[0].GetPaddedVal() == 0.f);

        return properPadding;
    }

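    // Replaces inputs[0] with the padded descriptor when the current one does not meet
    // the convolution's padding requirements, and returns true if an input reorder is
    // therefore needed. (The 'Covolution' spelling is kept as-is to match the existing
    // declaration used by callers.)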
    bool CovolutionUpdateInputParams(fused_conv_eltwise_params& params)
    {
        const auto req_input = GetConvolutionBFYXPaddedTensor(params);
        const bool bProperInputDesc = CheckConvolutionPaddedInputDesc(params, req_input);

        if (!bProperInputDesc)
        {
            params.inputs[0] = req_input;
            return true;
        }

        return false;
    }

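    // Returns true when the input already satisfies the required padding, or when the
    // optional parameters allow it to be reordered (allowInputReordering) so that it
    // can be made to.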
    bool FusedConvolutionEltwiseCheckInput(const Params& p, const optional_params& o)
    {
        const fused_conv_eltwise_params& params = static_cast<const fused_conv_eltwise_params&>(p);
        const fused_conv_eltwise_optional_params& optParams = static_cast<const fused_conv_eltwise_optional_params&>(o);

        const auto req_input = GetConvolutionBFYXPaddedTensor(params);
        const bool bProperInputDesc = CheckConvolutionPaddedInputDesc(params, req_input);
        const bool bInputPadded = optParams.allowInputReordering || bProperInputDesc;

        if (!bInputPadded)
        {
            return false;
        }

        return true;
    }

}