Publishing 2019 R1 content
[platform/upstream/dldt.git] / inference-engine / thirdparty / clDNN / kernel_selector / core / actual_kernels / fused_conv_eltwise / fused_conv_eltwise_kernel_bfyx_1x1_opt.cpp
1 /*
2 // Copyright (c) 2018 Intel Corporation
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //      http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 */
16
17 #include "fused_conv_eltwise_kernel_bfyx_1x1_opt.h"
18 #include "kernel_selector_utils.h"
19
20 namespace kernel_selector {
21
22     ParamsKey fused_conv_eltwise_kernel_bfyx_1x1_opt::GetSupportedKey() const
23         {
24         ParamsKey k;
25         k.EnableInputDataType(Datatype::F32);
26         k.EnableInputWeightsType(WeightsType::F32);
27         k.EnableOutputDataType(Datatype::F32);
28         k.EnableInputLayout(DataLayout::bfyx);
29         k.EnableOutputLayout(DataLayout::bfyx);
30         k.EnableTensorOffset();
31         k.EnableTensorPitches();
32         k.EnableSubGroup();
33         //k.EnableSubGroupShort(); // we need it for FP16 only. we check it on the Validate phase
34         k.EnableBiasPerFeature();
35         k.EnableNonBiasTerm();
36         k.EnableBatching();
37         k.EnableFusedConvEltwSplitSupport();
38         k.EnableFusedConvEltwiseRWOutOpt(); // data for second input are already in output
39         return k;
40         }
41
    // Size of the output tile computed per work-item:
    // out_width x out_height spatial positions across out_depth output
    // feature channels. Member order matters — callers aggregate-initialize
    // this as { width, height, depth }.
    struct block_params
    {
        int32_t out_width;
        int32_t out_height;
        int32_t out_depth;
    };
48
49     static block_params get_out_block_size(const fused_conv_eltwise_params& p)
50     {
51         auto out_depth = 8;
52
53         if (p.output.X().v == 7)
54         {
55             auto gws0 = p.output.X().v / 7;
56             auto gws1 = p.output.Y().v / 1;
57             auto gws2 = 2 * (p.output.Feature().v * p.output.Batch().v) / 8; // process 8 output channels per Workitem
58
59             auto compute_units = p.engineInfo.computeUnitsCount;
60             auto total_threads = (gws0 * gws1 * gws2) / 64;
61             if (total_threads < compute_units)
62             {
63                 out_depth /= 2;
64                 total_threads *= 2;
65             }
66             if (total_threads < compute_units)
67             {
68                 out_depth /= 2;
69                 total_threads *= 2;
70             }
71             return { 7,1,out_depth };
72         }
73         else if (p.output.X().v == 14)
74             return { 7,1,8 };
75         else if (p.output.X().v == 28)
76             return { 7,2,4 };
77         else if (p.output.X().v == 56)
78             return { 8,1,8 };
79
80         return { 1,1,1 };
81     }
82
83     std::string fused_conv_eltwise_kernel_bfyx_1x1_opt::GetKernelName(const fused_conv_eltwise_params& params) const
84     {
85         if (params.inputs[0].GetDType() == Datatype::F32)
86         {
87             return kernelName + "_fp32";
88         }
89         else
90         {
91             return kernelName + "_fp16";
92         }
93     }
94
95         bool fused_conv_eltwise_kernel_bfyx_1x1_opt::Validate(const Params& p, const optional_params& o) const
96         {
97                 if (!fused_conv_eltwise_kernel_base::Validate(p, o) ||
98                         !FusedConvolutionEltwiseCheckInput(p, o))
99                 {
100                         return false;
101                 }
102
103                 const fused_conv_eltwise_params& cp = static_cast<const fused_conv_eltwise_params&>(p);
104                 
105         if (cp.conv.stride.x != 1 || cp.conv.stride.y != 1)
106             return false;
107
108         if (cp.conv.filterSize.x != 1 || cp.conv.filterSize.y != 1)
109             return false;
110
111         if (cp.output.Feature().v % 64 != 0)
112             return false;
113
114         if (cp.conv.padding.x != 0 || cp.conv.padding.y != 0)
115             return false;
116
117         // if block sizes are 1x1, then this algorithm is probably not the best
118         auto block = get_out_block_size(cp);
119         if (block.out_width == 1 && block.out_height == 1)
120             return false;
121
122         if (cp.output.X().v % block.out_width != 0)
123             return false;
124         if (cp.output.Y().v % block.out_height != 0)
125             return false;
126
127                 return true;
128         }
129
130     std::vector<WeightsLayout> fused_conv_eltwise_kernel_bfyx_1x1_opt::GetSupportedWeightLayouts(const fused_conv_eltwise_params& p) const
131     {
132         auto block = get_out_block_size(p);
133         if (block.out_depth == 8)
134             return { WeightsLayout::os_iyx_osv64 };
135         if (block.out_depth == 4)
136             return { WeightsLayout::os_iyx_osv32 };
137         if (block.out_depth == 2)
138             return { WeightsLayout::os_iyx_osv16 };
139         else
140             return{ WeightsLayout::yxio };
141     }
142
143     fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_bfyx_1x1_opt::SetDefault(const fused_conv_eltwise_params& arg, int) const
144         {
145         DispatchData runInfo = Parent::SetDefault(arg);
146
147         constexpr size_t sub_group_size = 8;
148
149         runInfo.effiency = FORCE_PRIORITY_3;
150
151         auto block = get_out_block_size(arg);
152
153         runInfo.gws0 = arg.output.X().v / block.out_width;
154         runInfo.gws1 = arg.output.Y().v / block.out_height;
155         runInfo.gws2 = 2 * (arg.output.Feature().v * arg.output.Batch().v) / block.out_depth; // process 8 output channels per Workitem
156
157         runInfo.lws0 = 1;
158         runInfo.lws1 = 1;
159         runInfo.lws2 = 2 * sub_group_size;
160
161         return runInfo;
162         }
163
164         JitConstants fused_conv_eltwise_kernel_bfyx_1x1_opt::GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& runInfo) const
165         {
166                 auto jit = Parent::GetJitConstants(params, runInfo);
167
168         auto block = get_out_block_size(params);
169         jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", block.out_width));
170         jit.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", block.out_height));
171         jit.AddConstant(MakeJitConstant("OUT_BLOCK_DEPTH", block.out_depth));
172
173         if (!params.eltw.stride.empty())
174         {
175             jit.AddConstant(MakeJitConstant("ELTW_STRIDE_X", params.eltw.stride[0].x));
176             jit.AddConstant(MakeJitConstant("ELTW_STRIDE_Y", params.eltw.stride[0].y));
177         }
178         else
179         {
180             jit.AddConstant(MakeJitConstant("ELTW_STRIDE_X", 1));
181             jit.AddConstant(MakeJitConstant("ELTW_STRIDE_Y", 1));
182         }
183
184         return jit;
185         }
186
187     KernelsData fused_conv_eltwise_kernel_bfyx_1x1_opt::GetKernelsData(const Params& params, const optional_params& options) const
188     {
189         KernelsData kd = GetCommonKernelsData(params, options);
190         if (!kd.empty())
191             kd[0].estimatedTime = FORCE_PRIORITY_1;
192         return kd;
193     }
194 }