inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.cpp

   1 /*
   2 // Copyright (c) 2016 Intel Corporation
   3 //
   4 // Licensed under the Apache License, Version 2.0 (the "License");
   5 // you may not use this file except in compliance with the License.
   6 // You may obtain a copy of the License at
   7 //
   8 //      http://www.apache.org/licenses/LICENSE-2.0
   9 //
  10 // Unless required by applicable law or agreed to in writing, software
  11 // distributed under the License is distributed on an "AS IS" BASIS,
  12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 // See the License for the specific language governing permissions and
  14 // limitations under the License.
  15 */
  16
  17 #include "fused_conv_eltwise_kernel_base.h"
  18 #include "kernel_selector_utils.h"
  19 #include "common_tools.h"
  20
  21 namespace kernel_selector
  22 {
  23     std::string fused_conv_eltwise_params::to_string() const
  24     {
  25         std::stringstream s;
  26
  27         s << base_params::to_string() << "_";
  28         if (bias.empty())
  29         {
  30             s << "no_bias" << "_";
  31         }
  32         else
  33         {
  34             s << "bias_" << bias[0].PhysicalSize() << "_";
  35         }
  36
  37         s << conv.filterSize.x << "_" << conv.filterSize.y << "_";
  38         s << conv.stride.x << "_" << conv.stride.y << "_";
  39         s << conv.dilation.x << "_" << conv.dilation.y << "_";
  40         s << conv.padding.x << "_" << conv.padding.y << "_";
  41         s << conv.split;
  42
  43         return s.str();
  44     }
  45
  46     ParamsKey fused_conv_eltwise_params::GetParamsKey() const
  47     {
  48         ParamsKey k = weight_bias_params::GetParamsKey();
  49
  50         if (conv.split > 1)
  51         {
  52             k.EnableFusedConvEltwSplitSupport();
  53         }
  54
  55         if (conv.dilation.x != 1 ||
  56             conv.dilation.y != 1)
  57         {
  58             k.EnableFusedConvEltwDilation();
  59         }
  60
  61         if (conv.depthwise_separable_opt)
  62         {
  63             k.EnableFusedConvEltwDepthwiseSeparableOpt();
  64         }
  65
  66         if (conv.transposed)
  67         {
  68             k.EnableFusedConvEltwTranspose();
  69         }
  70
  71         if (conv.int8_quantization)
  72         {
  73             k.EnableFusedConvEltwInt8Quantization();
  74         }
  75
  76         if (conv.output_calibration)
  77         {
  78             k.EnableFusedConvEltwOutputCalibration();
  79         }
  80
  81         if (conv.local_convolution)
  82         {
  83             k.EnableFusedConvEltwLocalConvolution();
  84         }
  85
  86         if (second_input_in_output)
  87         {
  88             k.EnableFusedConvEltwiseRWOutOpt();
  89         }
  90
  91         return k;
  92     }
  93
  94     bool fused_conv_eltwise_kernel_base::Validate(const Params& p, const optional_params& o) const
  95     {
  96         if (p.GetType() != KernelType::FUSED_CONV_ELTWISE ||
  97             o.GetType() != KernelType::FUSED_CONV_ELTWISE)
  98         {
  99             return false;
 100         }
 101
 102         const fused_conv_eltwise_params& params = static_cast<const fused_conv_eltwise_params&>(p);
 103         const fused_conv_eltwise_optional_params& optParams = static_cast<const fused_conv_eltwise_optional_params&>(o);
 104
 105         bool bSupportedWeightsLayout = false;
 106
 107         for (WeightsLayout l : GetSupportedWeightLayouts(params))
 108         {
 109             bSupportedWeightsLayout |= params.weights.GetLayout() == l;
 110         }
 111
 112         const bool bWeightsOK = bSupportedWeightsLayout || optParams.allowStaticInputReordering;
 113
 114         if (!bWeightsOK)
 115         {
 116             return false;
 117         }
 118
 119         return true;
 120     }
 121
 122     JitConstants fused_conv_eltwise_kernel_base::GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& kd) const
 123     {
 124         JitConstants mem_consts = WeightBiasKernelBase::GetJitConstants(params);
 125         const auto& padding = params.conv.padding;
 126         const auto& input = params.inputs[0];
 127
 128         int64_t input_offset_with_padding = (int64_t)input.GetFirstElementOffset() - padding.x*input.X().pitch - input.Y().pitch*padding.y;
 129         input_offset_with_padding = std::max(input_offset_with_padding, (int64_t)0);
 130
 131         mem_consts.AddConstants({
 132             MakeJitConstant("STRIDE",                       params.conv.stride),
 133             MakeJitConstant("PADDING",                      params.conv.padding),
 134             MakeJitConstant("DILATION",                     params.conv.dilation),
 135             MakeJitConstant("FILTER_ARRAY_NUM",             params.conv.split),
 136             MakeJitConstant("INPUT0_OFFSET_WITH_PADDING",   input_offset_with_padding),
 137             MakeJitConstant("DEPTHWISE_SEPARABLE_OPT",      params.conv.depthwise_separable_opt),
 138             MakeJitConstant("QUANTIZATION_TERM",            params.conv.int8_quantization),
 139         });
 140
 141         if (params.conv.int8_quantization)
 142         {
 143             mem_consts.AddConstants({ MakeJitConstant("W_QF", params.conv.weights_quantization_factors[0]) });
 144             mem_consts.AddConstants({ MakeJitConstant("I_QF",params.conv.input_quantization_factor) });
 145
 146             if (params.conv.output_calibration)
 147             {
 148                 mem_consts.AddConstant(MakeJitConstant("CALIBRATION_TERM", params.conv.output_calibration));
 149                 mem_consts.AddConstant(MakeJitConstant("O_QF", params.conv.output_calibration_factors[0]));
 150
 151             }
 152             else
 153                 mem_consts.AddConstants({ MakeJitConstant("O_QF", params.conv.output_quantization_factor) });
 154         }
 155
 156         if (params.conv.local_convolution)
 157         {
 158             mem_consts.AddConstants({ MakeJitConstant("LOCAL_CONVOLUTION", params.conv.local_convolution) });
 159         }
 160
 161         JitConstants eltw_activations = MakeActivationJitConstants(params.eltw.activation, "_ELTW");
 162         mem_consts.Merge(eltw_activations);
 163
 164         mem_consts.AddConstant(MakeJitConstant("IN_OUT_OPT", params.second_input_in_output ? 1 : 0));
 165
 166         std::vector<uint32_t> unrollLoopParams{
 167             params.conv.filterSize.x,
 168             params.conv.filterSize.y,
 169             (uint32_t)kd.gemmStyle.globalWorkSizeDX,
 170             (uint32_t)kd.gemmStyle.globalWorkSizeDY,
 171             (uint32_t)kd.gemmStyle.globalWorkSizeDZ,
 172             (uint32_t)kd.gemmStyle.subBlockDimM,
 173             (uint32_t)kd.gemmStyle.subBlockDimK,
 174             (uint32_t)kd.gemmStyle.subBlockDimN
 175         };
 176
 177         auto loopCount = *std::max_element(unrollLoopParams.begin(), unrollLoopParams.end());
 178
 179         JitConstants mem_consts_loop = MakeLoopUnrollParamsJitConstants(loopCount);
 180         mem_consts.Merge(mem_consts_loop);
 181
 182         return mem_consts;
 183     }
 184
 185     bool fused_conv_eltwise_kernel_base::CheckWorkGroups(const fused_conv_eltwise_kernel_base::DispatchData& kd)
 186     {
 187         if (kd.gws0 == 0 ||
 188             kd.gws1 == 0 ||
 189             kd.gws2 == 0 ||
 190             kd.lws0 == 0 ||
 191             kd.lws1 == 0 ||
 192             kd.lws2 == 0)
 193         {
 194             return false;
 195         }
 196
 197         if ((kd.gws0 % kd.lws0) != 0 ||
 198             (kd.gws1 % kd.lws1) != 0 ||
 199             (kd.gws2 % kd.lws2) != 0)
 200         {
 201             return false;
 202         }
 203
 204         return true;
 205     }
 206
 207     namespace
 208     {
 209         bool CheckTensorForSplit(const DataTensor& t, uint32_t split)
 210         {
 211             if (t.PitchesDifferFromLogicalDims())
 212             {
 213                 auto feature = t.Feature();
 214                 auto featureIndex = DataTensor::Channelndex(t.GetLayout(), Tensor::DataChannelName::FEATURE);
 215                 if (featureIndex >= 0 && featureIndex+1 < (int)DataTensor::ChannelsCount(t.GetLayout()))
 216                 {
 217                     if (feature.v*split <= t.GetDims()[featureIndex+1].pitch)
 218                     {
 219                         Tensor::NDims newDims = t.GetDims();
 220                         newDims[featureIndex].v = feature.v*split;
 221
 222                         DataTensor newTensor{ newDims, t.GetDType(), t.GetLayout(), t.GetViewOffset(), t.PhysicalSize(), t.GetPaddedVal()};
 223
 224                         if (newTensor.PitchesDifferFromLogicalDims() == false)
 225                         {
 226                             return true;
 227                         }
 228                     }
 229                 }
 230
 231                 return false;
 232             }
 233
 234             return true;
 235         }
 236     }
 237
 238     bool fused_conv_eltwise_kernel_base::CheckPitchForSplitOnly(const fused_conv_eltwise_params& params)
 239     {
 240         // TODO: it's better to add pitch+offset support than handle this case
 241         return CheckTensorForSplit(params.inputs[0], params.conv.split);
 242     }
 243
 244     fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_base::SetDefault(const fused_conv_eltwise_params& params, int) const
 245     {
 246         DispatchData kd;
 247
 248         const auto& out = params.output;
 249         kd.fp16UnitUsed = out.GetDType() == Datatype::F16;
 250         std::vector<size_t> global;
 251         if (params.output.GetLayout() == DataLayout::bfyx || params.output.GetLayout() == DataLayout::byxf)
 252         {
 253             global = { out.X().v, out.Y().v, out.Feature().v*out.Batch().v };
 254         }
 255         else
 256         {
 257             global = { out.Feature().v*out.Batch().v, out.X().v, out.Y().v };
 258         }
 259
 260         auto local = GetOptimalLocalWorkGroupSizes(global);
 261
 262         kd.gws0 = global[0];
 263         kd.gws1 = global[1];
 264         kd.gws2 = global[2];
 265
 266         kd.lws0 = local[0];
 267         kd.lws1 = local[1];
 268         kd.lws2 = local[2];
 269
 270         kd.cldnnStyle.blockWidth = 1;
 271         kd.cldnnStyle.blockHeight = 1;
 272         kd.cldnnStyle.prefetch = 0;
 273         kd.cldnnStyle.inputBlockArraySize = 0;
 274         kd.cldnnStyle.inputBlockWidth = 0;
 275
 276         kd.gemmStyle.globalWorkSizeDX = 1;
 277         kd.gemmStyle.globalWorkSizeDY = 1;
 278         kd.gemmStyle.globalWorkSizeDZ = 1;
 279         kd.gemmStyle.subBlockDimK = 1;
 280         kd.gemmStyle.subBlockDimM = 0;
 281         kd.gemmStyle.subBlockDimN = 0;
 282         kd.effiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
 283         return kd;
 284     }
 285
 286     KernelsData fused_conv_eltwise_kernel_base::GetCommonKernelsData(const Params& params, const optional_params& options, const std::string exeMode, int autoTuneIndex) const
 287     {
 288         if (!Validate(params, options))
 289         {
 290             return{};
 291         }
 292
 293         KernelData kd = KernelData::Default<fused_conv_eltwise_params>(params);
 294         fused_conv_eltwise_params& newParams = *static_cast<fused_conv_eltwise_params*>(kd.params.get());
 295
 296         if (NeedPaddedInput())
 297         {
 298             kd.reorderInput = CovolutionUpdateInputParams(newParams);
 299         }
 300         DispatchData runInfo = SetDefault(newParams, autoTuneIndex);
 301
 302         if (!CheckWorkGroups(runInfo))
 303         {
 304             // Internal Error - wrong calculation of global/local work group sizes
 305             return{};
 306         }
 307
 308         bool succeed = UpdateWeightsParams(
 309             newParams,
 310             options,
 311             GetSupportedWeightLayouts(newParams),
 312             kd.weightsReorderParams);
 313
 314         if (!succeed)
 315         {
 316             return{};
 317         }
 318
 319         auto finalKernelName = GetKernelName(newParams);
 320         auto cldnnJit = GetJitConstants(newParams, runInfo);
 321         auto entryPoint = GetEntryPoint(finalKernelName, newParams.layerID, options);
 322         auto jit = CreateJit(finalKernelName, cldnnJit, entryPoint);
 323
 324         auto& kernel = kd.kernels[0];
 325         FillCLKernelData(kernel, runInfo, params.engineInfo, finalKernelName, jit, entryPoint, exeMode, true, !newParams.bias.empty(), 1, newParams.conv.int8_quantization, newParams.conv.output_calibration);
 326         kernel.arguments.push_back({ ArgumentDescriptor::Types::SPLIT, 0 });
 327         // eltwise's second input
 328         if(newParams.second_input_in_output)
 329         {
 330             kernel.arguments.push_back({ ArgumentDescriptor::Types::OUTPUT, 0 });
 331         }
 332         else
 333         {
 334             kernel.arguments.push_back({ ArgumentDescriptor::Types::INPUT, 1 });
 335         }
 336         if (!newParams.eltw.output_calibration_factors.empty())
 337             kernel.arguments.push_back({ArgumentDescriptor::Types::OUTPUT_CALIBRATION_FACTORS, 1});
 338
 339         kd.estimatedTime = runInfo.effiency;
 340         kd.autoTuneIndex = autoTuneIndex;
 341
 342         return{ kd };
 343     }
 344
 345     std::string fused_conv_eltwise_kernel_base::GetAutoTuneOptions(int autoTuneIndex) const
 346     {
 347         if ((autoTuneIndex >= 0) && (autoTuneIndex < (int)autoTuneOptions.size()))
 348         {
 349             return autoTuneOptions[autoTuneIndex];
 350         }
 351
 352         return DEFAULT;
 353     }
 354
 355     KernelsData fused_conv_eltwise_kernel_base::GetTunedKernelsDataByIndex(const Params& params, const optional_params& options, const int autoTuneIndex) const
 356     {
 357         return GetCommonKernelsData(params, options, GetAutoTuneOptions(autoTuneIndex), autoTuneIndex);
 358     }
 359
 360     KernelsData fused_conv_eltwise_kernel_base::GetKernelsDataForAutoTune(const Params& params, const optional_params& options) const
 361     {
 362         if (!Validate(params, options))
 363         {
 364             return{};
 365         }
 366
 367         KernelsData res = {};
 368
 369         for (size_t i = 0; i < autoTuneOptions.size(); i++)
 370         {
 371             KernelsData kd = GetTunedKernelsDataByIndex(params, options, (int)i);
 372             if (!kd.empty())
 373             {
 374                 res.emplace_back(kd[0]);
 375             }
 376         }
 377
 378         return res;
 379     }
 380
 381     static DataTensor GetConvolutionBFYXPaddedTensor(const fused_conv_eltwise_params& cp)
 382     {
 383         assert(cp.inputs[0].GetDims().size() == 4U);
 384
 385         DataTensor t = cp.inputs[0];
 386         std::vector<Tensor::Pad> pad{ { 0,0 },{ 0,0 },{ 0,0 },{ 0,0 } };
 387
 388         auto& conv = cp.conv;
 389
 390         pad[0].before = conv.padding.x;
 391         pad[1].before = conv.padding.y;
 392
 393
 394         const auto inputLimitX = (cp.output.X().v - 1) * conv.stride.x + (conv.filterSize.x - 1) * conv.dilation.x + 1;
 395         const auto inputLimitY = (cp.output.Y().v - 1) * conv.stride.y + (conv.filterSize.y - 1) * conv.dilation.y + 1;
 396
 397         pad[0].after = (size_t)std::max((int)inputLimitX - (int)t.X().v - (int)pad[0].before, (int)0);
 398         pad[1].after = (size_t)std::max((int)inputLimitY - (int)t.Y().v - (int)pad[1].before, (int)0);
 399
 400         Tensor::NDims dims(4);
 401         const Tensor::NDims& orgDims = cp.inputs[0].GetDims();
 402         size_t pitch = 1;
 403         for (size_t i = 0; i < dims.size(); i++)
 404         {
 405             dims[i].pad = pad[i];
 406             dims[i].v = orgDims[i].v;
 407             dims[i].pitch = pitch;
 408             pitch *= dims[i].LogicalDimPadded();
 409         }
 410
 411         return{ dims, t.GetDType(), t.GetLayout() };
 412     }
 413
 414     bool CheckConvolutionPaddedInputDesc(const fused_conv_eltwise_params& params, const DataTensor& reqDesc)
 415     {
 416         bool properPadding =
 417             reqDesc.X().pad.before <= params.inputs[0].X().pad.before &&
 418             reqDesc.Y().pad.before <= params.inputs[0].Y().pad.before &&
 419             reqDesc.Feature().pad.before <= params.inputs[0].Feature().pad.before &&
 420             reqDesc.Batch().pad.before <= params.inputs[0].Batch().pad.before;
 421
 422         properPadding &=
 423             reqDesc.X().pad.after <= params.inputs[0].X().pad.after &&
 424             reqDesc.Y().pad.after <= params.inputs[0].Y().pad.after &&
 425             reqDesc.Feature().pad.after <= params.inputs[0].Feature().pad.after &&
 426             reqDesc.Batch().pad.after <= params.inputs[0].Batch().pad.after;
 427
 428         properPadding &= ((params.conv.padding.x == 0 && params.conv.padding.y == 0) || params.inputs[0].GetPaddedVal() == 0.f);
 429
 430         return properPadding;
 431     }
 432
 433     bool CovolutionUpdateInputParams(fused_conv_eltwise_params& params)
 434     {
 435         const auto req_input = GetConvolutionBFYXPaddedTensor(params);
 436         const bool bProperInputDesc = CheckConvolutionPaddedInputDesc(params, req_input);
 437
 438         if (!bProperInputDesc)
 439         {
 440             params.inputs[0] = req_input;
 441             return true;
 442         }
 443
 444         return false;
 445     }
 446
 447     bool FusedConvolutionEltwiseCheckInput(const Params& p, const optional_params& o)
 448     {
 449         const fused_conv_eltwise_params& params = static_cast<const fused_conv_eltwise_params&>(p);
 450         const fused_conv_eltwise_optional_params& optParams = static_cast<const fused_conv_eltwise_optional_params&>(o);
 451
 452         const auto req_input = GetConvolutionBFYXPaddedTensor(params);
 453         const bool bProperInputDesc = CheckConvolutionPaddedInputDesc(params, req_input);
 454         const bool bInputPadded = optParams.allowInputReordering || bProperInputDesc;
 455
 456         if (!bInputPadded)
 457         {
 458             return false;
 459         }
 460
 461         return true;
 462     }
 463
 464 }