inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1_opt.cpp

   1 /*
   2 // Copyright (c) 2018 Intel Corporation
   3 //
   4 // Licensed under the Apache License, Version 2.0 (the "License");
   5 // you may not use this file except in compliance with the License.
   6 // You may obtain a copy of the License at
   7 //
   8 //      http://www.apache.org/licenses/LICENSE-2.0
   9 //
  10 // Unless required by applicable law or agreed to in writing, software
  11 // distributed under the License is distributed on an "AS IS" BASIS,
  12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 // See the License for the specific language governing permissions and
  14 // limitations under the License.
  15 */
  16
  17 #include "convolution_kernel_bfyx_1x1_opt.h"
  18
  19 namespace kernel_selector
  20 {
  21
  22     convolution_kernel_bfyx_1x1_opt::convolution_kernel_bfyx_1x1_opt() : ConvolutionKernelBase("convolution_gpu_bfyx_1x1_opt")
  23     {
  24     }
  25
  26     ParamsKey convolution_kernel_bfyx_1x1_opt::GetSupportedKey() const
  27     {
  28         ParamsKey k;
  29         k.EnableInputDataType(Datatype::F32);
  30         k.EnableInputWeightsType(WeightsType::F32);
  31         k.EnableOutputDataType(Datatype::F32);
  32         k.EnableInputLayout(DataLayout::bfyx);
  33         k.EnableOutputLayout(DataLayout::bfyx);
  34         k.EnableTensorOffset();
  35         k.EnableTensorPitches();
  36         k.EnableSubGroup();
  37         k.EnableBiasPerFeature();
  38         k.EnableBiasPerOutput();
  39         k.EnableNonBiasTerm();
  40         k.EnableBatching();
  41         return k;
  42     }
  43
  44     struct block_params
  45     {
  46         int32_t out_width;
  47         int32_t out_height;
  48         int32_t out_depth;
  49     };
  50
  51     static block_params get_out_block_size(const convolution_params& p)
  52     {
  53         auto out_depth = 8;
  54
  55         if (p.output.X().v == 7)
  56         {
  57             auto gws0 = p.output.X().v / 7;
  58             auto gws1 = p.output.Y().v / 1;
  59             auto gws2 = 2*(p.output.Feature().v * p.output.Batch().v) / 8 ; // process 8 output channels per Workitem
  60
  61             auto compute_units = p.engineInfo.computeUnitsCount;
  62             auto total_threads = (gws0 * gws1 * gws2) / 64;
  63             if (total_threads < compute_units)
  64             {
  65                 out_depth /= 2;
  66                 total_threads *= 2;
  67             }
  68             if (total_threads < compute_units)
  69             {
  70                 out_depth /= 2;
  71                 total_threads *= 2;
  72             }
  73             return { 7,1,out_depth };
  74         }
  75         else if (p.output.X().v == 14)
  76             return { 7,1,8 };
  77         else if (p.output.X().v == 28)
  78             return { 7,2,4 };
  79         else if (p.output.X().v == 56)
  80             return { 8,1,8 };
  81
  82         return { 1,1,1 };
  83     }
  84
  85
  86     ConvolutionKernelBase::DispatchData convolution_kernel_bfyx_1x1_opt::SetDefault(const convolution_params& cp, int) const
  87     {
  88         DispatchData runInfo = ConvolutionKernelBase::SetDefault(cp);
  89
  90         constexpr size_t sub_group_size = 8;
  91
  92         runInfo.effiency = FORCE_PRIORITY_3;
  93
  94         auto block = get_out_block_size(cp);
  95
  96         runInfo.gws0 = cp.output.X().v / block.out_width;
  97         runInfo.gws1 = cp.output.Y().v / block.out_height;
  98         runInfo.gws2 = 2*(cp.output.Feature().v * cp.output.Batch().v) / block.out_depth; // process 8 output channels per Workitem
  99
 100         runInfo.lws0 = 1;
 101         runInfo.lws1 = 1;
 102         runInfo.lws2 = 2*sub_group_size;
 103
 104         return runInfo;
 105     }
 106
 107     bool convolution_kernel_bfyx_1x1_opt::Validate(const Params& p, const optional_params& o) const
 108     {
 109         if (!ConvolutionKernelBase::Validate(p, o))
 110         {
 111             return false;
 112         }
 113         const convolution_params& cp = static_cast<const convolution_params&>(p);
 114
 115         if (cp.stride.x != 1 || cp.stride.y != 1)
 116             return false;
 117
 118         if (cp.filterSize.x != 1 || cp.filterSize.y != 1)
 119             return false;
 120
 121         if (cp.output.Feature().v % 64 != 0)
 122             return false;
 123
 124         if (cp.padding.x != 0 || cp.padding.y != 0)
 125             return false;
 126
 127         // if block sizes are 1x1, then this algorithm is probably not the best
 128         auto block = get_out_block_size(cp);
 129         if (block.out_width == 1 && block.out_height == 1)
 130             return false;
 131
 132         if (cp.output.X().v % block.out_width != 0)
 133             return false;
 134         if (cp.output.Y().v % block.out_height != 0)
 135             return false;
 136
 137         return true;
 138     }
 139
 140     JitConstants convolution_kernel_bfyx_1x1_opt::GetJitConstants(const convolution_params& params, const DispatchData& runInfo) const
 141     {
 142         auto jit = Parent::GetJitConstants(params, runInfo);
 143
 144         auto block = get_out_block_size(params);
 145         jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", block.out_width));
 146         jit.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", block.out_height));
 147         jit.AddConstant(MakeJitConstant("OUT_BLOCK_DEPTH", block.out_depth));
 148
 149         return jit;
 150     }
 151
 152     std::vector<WeightsLayout> convolution_kernel_bfyx_1x1_opt::GetSupportedWeightLayouts(const convolution_params& cp) const
 153     {
 154         auto block = get_out_block_size(cp);
 155         if (block.out_depth == 8)
 156             return { WeightsLayout::os_iyx_osv64 };
 157         if (block.out_depth == 4)
 158             return { WeightsLayout::os_iyx_osv32 };
 159         if (block.out_depth == 2)
 160             return { WeightsLayout::os_iyx_osv16 };
 161         else
 162             return{ WeightsLayout::yxio };
 163     }
 164
 165     KernelsData convolution_kernel_bfyx_1x1_opt::GetKernelsData(const Params& params, const optional_params& options) const
 166     {
 167         KernelsData kd = GetCommonKernelsData(params, options);
 168         if (!kd.empty())
 169             kd[0].estimatedTime = FORCE_PRIORITY_1;
 170         return kd;
 171     }
 172
 173 }