2 // Copyright (c) 2016 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
17 #include "fused_conv_eltwise_kernel_base.h"
18 #include "kernel_selector_utils.h"
19 #include "common_tools.h"
21 namespace kernel_selector
// Serializes this parameter set into an underscore-separated string
// (base params, bias size, then conv filter/stride/dilation/padding).
// NOTE(review): several structural lines of this body (braces, the
// bias-empty branch, the final return) are not visible in this view.
23 std::string fused_conv_eltwise_params::to_string() const
27     s << base_params::to_string() << "_";
// Emitted when no bias tensor is attached.
30     s << "no_bias" << "_";
// Otherwise: record the physical element count of the first bias tensor.
34     s << "bias_" << bias[0].PhysicalSize() << "_";
// Convolution geometry, x then y for each attribute.
37     s << conv.filterSize.x << "_" << conv.filterSize.y << "_";
38     s << conv.stride.x << "_" << conv.stride.y << "_";
39     s << conv.dilation.x << "_" << conv.dilation.y << "_";
40     s << conv.padding.x << "_" << conv.padding.y << "_";
// Builds the ParamsKey describing which fused conv+eltwise features this
// configuration needs, starting from the weight/bias base key. Kernel
// implementations are matched against this key during selection.
46 ParamsKey fused_conv_eltwise_params::GetParamsKey() const
48 ParamsKey k = weight_bias_params::GetParamsKey();
// Split (grouped) convolution support.
52 k.EnableFusedConvEltwSplitSupport();
// Dilation support when any dilation component differs from 1.
// NOTE(review): the rest of this condition is not visible in this view.
55 if (conv.dilation.x != 1 ||
58 k.EnableFusedConvEltwDilation();
// Depthwise-separable optimization flag.
61 if (conv.depthwise_separable_opt)
63 k.EnableFusedConvEltwDepthwiseSeparableOpt();
// Transposed convolution support (guarding condition not visible here).
68 k.EnableFusedConvEltwTranspose();
// INT8 quantization of the convolution.
71 if (conv.int8_quantization)
73 k.EnableFusedConvEltwInt8Quantization();
// Per-output-channel calibration factors.
76 if (conv.output_calibration)
78 k.EnableFusedConvEltwOutputCalibration();
// Local (non-shared-weights) convolution.
81 if (conv.local_convolution)
83 k.EnableFusedConvEltwLocalConvolution();
// Eltwise second input aliases the output buffer (read-write-out opt).
86 if (second_input_in_output)
88 k.EnableFusedConvEltwiseRWOutOpt();
// Validates that params/options target the FUSED_CONV_ELTWISE kernel type
// and that the weights layout is one of the layouts this kernel supports
// (or static reordering of the weights is permitted by the options).
// NOTE(review): the return statements of this body are not visible here.
94 bool fused_conv_eltwise_kernel_base::Validate(const Params& p, const optional_params& o) const
// Reject any mismatched kernel type up front.
96 if (p.GetType() != KernelType::FUSED_CONV_ELTWISE ||
97 o.GetType() != KernelType::FUSED_CONV_ELTWISE)
// Safe after the type check above (static_cast, not dynamic_cast).
102 const fused_conv_eltwise_params& params = static_cast<const fused_conv_eltwise_params&>(p);
103 const fused_conv_eltwise_optional_params& optParams = static_cast<const fused_conv_eltwise_optional_params&>(o);
105 bool bSupportedWeightsLayout = false;
// Accept if the current weights layout matches any supported layout.
107 for (WeightsLayout l : GetSupportedWeightLayouts(params))
109 bSupportedWeightsLayout |= params.weights.GetLayout() == l;
// An unsupported layout is still OK when the weights may be statically reordered.
112 const bool bWeightsOK = bSupportedWeightsLayout || optParams.allowStaticInputReordering;
// Assembles the JIT #define constants passed to the OpenCL kernel:
// convolution geometry, quantization/calibration factors, eltwise
// activation macros and the loop-unroll helper constants.
122 JitConstants fused_conv_eltwise_kernel_base::GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& kd) const
124 JitConstants mem_consts = WeightBiasKernelBase::GetJitConstants(params);
125 const auto& padding = params.conv.padding;
126 const auto& input = params.inputs[0];
// Offset of the first element shifted back by the conv padding,
// clamped at 0 so the kernel never indexes before the buffer start.
128 int64_t input_offset_with_padding = (int64_t)input.GetFirstElementOffset() - padding.x*input.X().pitch - input.Y().pitch*padding.y;
129 input_offset_with_padding = std::max(input_offset_with_padding, (int64_t)0);
// Core convolution macros consumed by the .cl source.
131 mem_consts.AddConstants({
132 MakeJitConstant("STRIDE", params.conv.stride),
133 MakeJitConstant("PADDING", params.conv.padding),
134 MakeJitConstant("DILATION", params.conv.dilation),
135 MakeJitConstant("FILTER_ARRAY_NUM", params.conv.split),
136 MakeJitConstant("INPUT0_OFFSET_WITH_PADDING", input_offset_with_padding),
137 MakeJitConstant("DEPTHWISE_SEPARABLE_OPT", params.conv.depthwise_separable_opt),
138 MakeJitConstant("QUANTIZATION_TERM", params.conv.int8_quantization),
// INT8 path: weight and input quantization factors.
141 if (params.conv.int8_quantization)
143 mem_consts.AddConstants({ MakeJitConstant("W_QF", params.conv.weights_quantization_factors[0]) });
144 mem_consts.AddConstants({ MakeJitConstant("I_QF",params.conv.input_quantization_factor) });
// With calibration: per-channel output factor; otherwise a single
// output quantization factor is emitted as O_QF below.
146 if (params.conv.output_calibration)
148 mem_consts.AddConstant(MakeJitConstant("CALIBRATION_TERM", params.conv.output_calibration));
149 mem_consts.AddConstant(MakeJitConstant("O_QF", params.conv.output_calibration_factors[0]));
153 mem_consts.AddConstants({ MakeJitConstant("O_QF", params.conv.output_quantization_factor) });
156 if (params.conv.local_convolution)
158 mem_consts.AddConstants({ MakeJitConstant("LOCAL_CONVOLUTION", params.conv.local_convolution) });
// Activation macros for the eltwise stage get an "_ELTW" suffix so they
// do not collide with the convolution's own activation macros.
161 JitConstants eltw_activations = MakeActivationJitConstants(params.eltw.activation, "_ELTW");
162 mem_consts.Merge(eltw_activations);
164 mem_consts.AddConstant(MakeJitConstant("IN_OUT_OPT", params.second_input_in_output ? 1 : 0));
// Loop unrolling is sized by the largest of the filter dims and the
// GEMM-style dispatch parameters.
166 std::vector<uint32_t> unrollLoopParams{
167 params.conv.filterSize.x,
168 params.conv.filterSize.y,
169 (uint32_t)kd.gemmStyle.globalWorkSizeDX,
170 (uint32_t)kd.gemmStyle.globalWorkSizeDY,
171 (uint32_t)kd.gemmStyle.globalWorkSizeDZ,
172 (uint32_t)kd.gemmStyle.subBlockDimM,
173 (uint32_t)kd.gemmStyle.subBlockDimK,
174 (uint32_t)kd.gemmStyle.subBlockDimN
177 auto loopCount = *std::max_element(unrollLoopParams.begin(), unrollLoopParams.end());
179 JitConstants mem_consts_loop = MakeLoopUnrollParamsJitConstants(loopCount);
180 mem_consts.Merge(mem_consts_loop);
// Sanity-checks the computed dispatch: each global work size must be an
// exact multiple of the corresponding local work size, as required by
// OpenCL NDRange enqueue rules.
185 bool fused_conv_eltwise_kernel_base::CheckWorkGroups(const fused_conv_eltwise_kernel_base::DispatchData& kd)
197 if ((kd.gws0 % kd.lws0) != 0 ||
198 (kd.gws1 % kd.lws1) != 0 ||
199 (kd.gws2 % kd.lws2) != 0)
// Checks whether tensor `t` can be viewed with its feature dimension
// multiplied by `split` without introducing pitch/offset gaps — i.e. the
// split pieces are laid out contiguously in memory.
209 bool CheckTensorForSplit(const DataTensor& t, uint32_t split)
// A tensor whose pitches already diverge from its logical dims cannot
// be treated this way (early-exit branch; body not visible here).
211 if (t.PitchesDifferFromLogicalDims())
213 auto feature = t.Feature();
214 auto featureIndex = DataTensor::Channelndex(t.GetLayout(), Tensor::DataChannelName::FEATURE);
// Only meaningful when FEATURE exists and is not the outermost channel.
215 if (featureIndex >= 0 && featureIndex+1 < (int)DataTensor::ChannelsCount(t.GetLayout()))
// The widened feature extent must fit within the next channel's pitch.
217 if (feature.v*split <= t.GetDims()[featureIndex+1].pitch)
// Build a trial tensor with feature scaled by split and re-check pitches.
219 Tensor::NDims newDims = t.GetDims();
220 newDims[featureIndex].v = feature.v*split;
222 DataTensor newTensor{ newDims, t.GetDType(), t.GetLayout(), t.GetViewOffset(), t.PhysicalSize(), t.GetPaddedVal()};
224 if (newTensor.PitchesDifferFromLogicalDims() == false)
// Returns whether the first input tensor is laid out so that the
// split-only path (no per-split pitch/offset handling) can be used.
238 bool fused_conv_eltwise_kernel_base::CheckPitchForSplitOnly(const fused_conv_eltwise_params& params)
240 // TODO: it's better to add pitch+offset support than handle this case
241 return CheckTensorForSplit(params.inputs[0], params.conv.split);
// Produces the default DispatchData for this kernel: global/local work
// sizes derived from the output tensor layout, plus neutral values for
// the cldnn-style and gemm-style tuning fields. The unnamed int parameter
// is the auto-tune index, unused by this base implementation.
244 fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_base::SetDefault(const fused_conv_eltwise_params& params, int) const
248 const auto& out = params.output;
249 kd.fp16UnitUsed = out.GetDType() == Datatype::F16;
250 std::vector<size_t> global;
// For bfyx/byxf outputs dispatch over (X, Y, F*B); otherwise (F*B, X, Y).
251 if (params.output.GetLayout() == DataLayout::bfyx || params.output.GetLayout() == DataLayout::byxf)
253 global = { out.X().v, out.Y().v, out.Feature().v*out.Batch().v };
257 global = { out.Feature().v*out.Batch().v, out.X().v, out.Y().v };
260 auto local = GetOptimalLocalWorkGroupSizes(global);
// Neutral cldnn-style blocking (no blocking, no prefetch).
270 kd.cldnnStyle.blockWidth = 1;
271 kd.cldnnStyle.blockHeight = 1;
272 kd.cldnnStyle.prefetch = 0;
273 kd.cldnnStyle.inputBlockArraySize = 0;
274 kd.cldnnStyle.inputBlockWidth = 0;
// Neutral gemm-style sub-block sizing.
276 kd.gemmStyle.globalWorkSizeDX = 1;
277 kd.gemmStyle.globalWorkSizeDY = 1;
278 kd.gemmStyle.globalWorkSizeDZ = 1;
279 kd.gemmStyle.subBlockDimK = 1;
280 kd.gemmStyle.subBlockDimM = 0;
281 kd.gemmStyle.subBlockDimN = 0;
// "effiency" [sic] is the DispatchData field's spelling project-wide.
282 kd.effiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
// Shared kernel-data construction path: validates params, optionally
// reorders the input/weights, builds JIT constants and the entry point,
// fills the CL kernel descriptor and appends the fused-eltwise arguments.
286 KernelsData fused_conv_eltwise_kernel_base::GetCommonKernelsData(const Params& params, const optional_params& options, const std::string exeMode, int autoTuneIndex) const
288 if (!Validate(params, options))
// Work on a mutable copy of the params owned by the KernelData.
293 KernelData kd = KernelData::Default<fused_conv_eltwise_params>(params);
294 fused_conv_eltwise_params& newParams = *static_cast<fused_conv_eltwise_params*>(kd.params.get());
// Derived kernels that need padded input trigger an input reorder here.
296 if (NeedPaddedInput())
298 kd.reorderInput = CovolutionUpdateInputParams(newParams);
300 DispatchData runInfo = SetDefault(newParams, autoTuneIndex);
302 if (!CheckWorkGroups(runInfo))
304 // Internal Error - wrong calculation of global/local work group sizes
// Reorder weights into a supported layout if necessary.
308 bool succeed = UpdateWeightsParams(
311 GetSupportedWeightLayouts(newParams),
312 kd.weightsReorderParams)
319 auto finalKernelName = GetKernelName(newParams);
320 auto cldnnJit = GetJitConstants(newParams, runInfo);
321 auto entryPoint = GetEntryPoint(finalKernelName, newParams.layerID, options);
322 auto jit = CreateJit(finalKernelName, cldnnJit, entryPoint);
324 auto& kernel = kd.kernels[0];
325 FillCLKernelData(kernel, runInfo, params.engineInfo, finalKernelName, jit, entryPoint, exeMode, true, !newParams.bias.empty(), 1, newParams.conv.int8_quantization, newParams.conv.output_calibration);
326 kernel.arguments.push_back({ ArgumentDescriptor::Types::SPLIT, 0 });
327 // eltwise's second input
// Either read from the output buffer (in-out optimization) or bind the
// second input tensor explicitly.
328 if(newParams.second_input_in_output)
330 kernel.arguments.push_back({ ArgumentDescriptor::Types::OUTPUT, 0 });
334 kernel.arguments.push_back({ ArgumentDescriptor::Types::INPUT, 1 });
336 if (!newParams.eltw.output_calibration_factors.empty())
337 kernel.arguments.push_back({ArgumentDescriptor::Types::OUTPUT_CALIBRATION_FACTORS, 1});
// "effiency" [sic] matches the DispatchData field spelling.
339 kd.estimatedTime = runInfo.effiency;
340 kd.autoTuneIndex = autoTuneIndex;
// Returns the exe-mode string for a valid auto-tune index; the fallback
// for out-of-range indices is outside this view.
345 std::string fused_conv_eltwise_kernel_base::GetAutoTuneOptions(int autoTuneIndex) const
347 if ((autoTuneIndex >= 0) && (autoTuneIndex < (int)autoTuneOptions.size()))
349 return autoTuneOptions[autoTuneIndex];
// Convenience wrapper: builds kernels data for one specific auto-tune
// configuration selected by index.
355 KernelsData fused_conv_eltwise_kernel_base::GetTunedKernelsDataByIndex(const Params& params, const optional_params& options, const int autoTuneIndex) const
357 return GetCommonKernelsData(params, options, GetAutoTuneOptions(autoTuneIndex), autoTuneIndex);
// Enumerates every auto-tune option and collects one KernelsData entry
// per successfully built configuration for the tuner to benchmark.
360 KernelsData fused_conv_eltwise_kernel_base::GetKernelsDataForAutoTune(const Params& params, const optional_params& options) const
362 if (!Validate(params, options))
367 KernelsData res = {};
369 for (size_t i = 0; i < autoTuneOptions.size(); i++)
371 KernelsData kd = GetTunedKernelsDataByIndex(params, options, (int)i);
// Only the first kernel of each tuned result is kept.
374 res.emplace_back(kd[0]);
// Computes the input tensor descriptor (BFYX, 4D) with the padding the
// convolution requires: before-padding from conv.padding, after-padding
// from the receptive field implied by output size, stride, filter size
// and dilation. Pitches are recomputed from the padded dims.
381 static DataTensor GetConvolutionBFYXPaddedTensor(const fused_conv_eltwise_params& cp)
383 assert(cp.inputs[0].GetDims().size() == 4U);
385 DataTensor t = cp.inputs[0];
// pad[0] = X, pad[1] = Y; feature/batch keep zero padding.
386 std::vector<Tensor::Pad> pad{ { 0,0 },{ 0,0 },{ 0,0 },{ 0,0 } };
388 auto& conv = cp.conv;
390 pad[0].before = conv.padding.x;
391 pad[1].before = conv.padding.y;
// Last input coordinate touched by the filter for the last output pixel.
394 const auto inputLimitX = (cp.output.X().v - 1) * conv.stride.x + (conv.filterSize.x - 1) * conv.dilation.x + 1;
395 const auto inputLimitY = (cp.output.Y().v - 1) * conv.stride.y + (conv.filterSize.y - 1) * conv.dilation.y + 1;
// After-padding = whatever of the receptive field the real extent plus
// before-padding does not already cover (never negative).
397 pad[0].after = (size_t)std::max((int)inputLimitX - (int)t.X().v - (int)pad[0].before, (int)0);
398 pad[1].after = (size_t)std::max((int)inputLimitY - (int)t.Y().v - (int)pad[1].before, (int)0);
400 Tensor::NDims dims(4);
401 const Tensor::NDims& orgDims = cp.inputs[0].GetDims();
// Rebuild dims innermost-first, accumulating the pitch across the
// padded logical extents.
403 for (size_t i = 0; i < dims.size(); i++)
405 dims[i].pad = pad[i];
406 dims[i].v = orgDims[i].v;
407 dims[i].pitch = pitch;
408 pitch *= dims[i].LogicalDimPadded();
411 return{ dims, t.GetDType(), t.GetLayout() };
// Returns true when the actual input tensor already carries at least the
// before/after padding that `reqDesc` demands on every channel, and —
// if the convolution pads — the input's padded value is zero (so padded
// reads behave like implicit zero-padding).
414 bool CheckConvolutionPaddedInputDesc(const fused_conv_eltwise_params& params, const DataTensor& reqDesc)
// Required before-padding must not exceed what the input provides.
417 reqDesc.X().pad.before <= params.inputs[0].X().pad.before &&
418 reqDesc.Y().pad.before <= params.inputs[0].Y().pad.before &&
419 reqDesc.Feature().pad.before <= params.inputs[0].Feature().pad.before &&
420 reqDesc.Batch().pad.before <= params.inputs[0].Batch().pad.before;
// Same requirement for after-padding.
423 reqDesc.X().pad.after <= params.inputs[0].X().pad.after &&
424 reqDesc.Y().pad.after <= params.inputs[0].Y().pad.after &&
425 reqDesc.Feature().pad.after <= params.inputs[0].Feature().pad.after &&
426 reqDesc.Batch().pad.after <= params.inputs[0].Batch().pad.after;
// Nonzero conv padding is only valid over a zero-filled padded area.
428 properPadding &= ((params.conv.padding.x == 0 && params.conv.padding.y == 0) || params.inputs[0].GetPaddedVal() == 0.f);
430 return properPadding;
// Replaces the first input descriptor with the conv-required padded
// tensor when the existing one lacks the needed padding; the return value
// is consumed as the "reorder input" flag by GetCommonKernelsData.
// NOTE: the name's missing 'n' ("Covolution") is the established public
// identifier and must not be changed here.
433 bool CovolutionUpdateInputParams(fused_conv_eltwise_params& params)
435 const auto req_input = GetConvolutionBFYXPaddedTensor(params);
436 const bool bProperInputDesc = CheckConvolutionPaddedInputDesc(params, req_input);
// Input needs reordering: adopt the required padded descriptor.
438 if (!bProperInputDesc)
440 params.inputs[0] = req_input;
447 bool FusedConvolutionEltwiseCheckInput(const Params& p, const optional_params& o)
449 const fused_conv_eltwise_params& params = static_cast<const fused_conv_eltwise_params&>(p);
450 const fused_conv_eltwise_optional_params& optParams = static_cast<const fused_conv_eltwise_optional_params&>(o);
452 const auto req_input = GetConvolutionBFYXPaddedTensor(params);
453 const bool bProperInputDesc = CheckConvolutionPaddedInputDesc(params, req_input);
454 const bool bInputPadded = optParams.allowInputReordering || bProperInputDesc;