inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b1_block_multiple_x.cpp

   1 /*
   2 // Copyright (c) 2016 Intel Corporation
   3 //
   4 // Licensed under the Apache License, Version 2.0 (the "License");
   5 // you may not use this file except in compliance with the License.
   6 // You may obtain a copy of the License at
   7 //
   8 //      http://www.apache.org/licenses/LICENSE-2.0
   9 //
  10 // Unless required by applicable law or agreed to in writing, software
  11 // distributed under the License is distributed on an "AS IS" BASIS,
  12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 // See the License for the specific language governing permissions and
  14 // limitations under the License.
  15 */
  16
  17 #include "convolution_kernel_yxfb_yxio_b1_block_multiple_x.h"
  18
  19 namespace kernel_selector
  20 {
  21
  22     constexpr size_t local_work_size = 16;
  23
  24     ParamsKey ConvolutionKernel_yxfb_yxio_b1_block_mulitple_x::GetSupportedKey() const
  25     {
  26         ParamsKey k;
  27         k.EnableInputDataType(Datatype::F32);
  28         k.EnableInputWeightsType(WeightsType::F16);
  29         k.EnableInputWeightsType(WeightsType::F32);
  30         k.EnableOutputDataType(Datatype::F32);
  31         k.EnableInputLayout(DataLayout::yxfb);
  32         k.EnableOutputLayout(DataLayout::yxfb);
  33         k.EnableTensorOffset();
  34         k.EnableTensorPitches();
  35         k.EnableBiasPerFeature();
  36         k.EnableNonBiasTerm();
  37         k.EnableBatching();
  38         k.EnableSplitSupport();
  39         k.EnableDilation();
  40         k.EnableSubGroup();
  41         return k;
  42     }
  43
  44     namespace {
  45         size_t GetOfmPerWorkitem(size_t filter_ofm_num, size_t localWorkSize)
  46         {
  47             if (filter_ofm_num % (localWorkSize * 4) == 0)
  48                 return 4;
  49             if (filter_ofm_num % (localWorkSize * 2) == 0)
  50                 return 2;
  51             return 1;
  52         }
  53     }
  54
  55     ConvolutionKernelBase::DispatchData ConvolutionKernel_yxfb_yxio_b1_block_mulitple_x::SetDefault(const convolution_params& arg, int autoTuneIndex) const
  56     {
  57         DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg, autoTuneIndex);
  58
  59         const auto filter_ofm_num = arg.weights.OFM().v;
  60         const auto batch_size = arg.output.Batch().v;
  61
  62         runInfo.lws0 = local_work_size;
  63
  64         // We cannot return 8 because we are processing 4 spatial coordinates for batch1,
  65         // and if we use more than 4 ofm_per_work_item we downgrade simd16 to simd8 which would break this algorithm.
  66         // NOTE: We could return 8 but then we must process only 2 coordinates, which is slower than processing 4 coordinates using blockread4
  67         // TODO: experiment with SIMD8 version of algorithm and check if it could be faster
  68         /*if (output_feature_count % (lws * 8) == 0)
  69         {
  70         run_info.ofm_per_work_item = 8;
  71         run_info.gws1 = static_cast<size_t>(std::ceil(static_cast<float>(run_info.gws1) / 2.0f));
  72         }
  73         else*/
  74         const size_t ofmPerWorkItem = GetOfmPerWorkitem(filter_ofm_num, local_work_size);
  75         if (ofmPerWorkItem == 4)
  76         {
  77             // We compute multiple spatial coordinates "x" in a single workitem that's why we must divide
  78             runInfo.gws1 = static_cast<size_t>(std::ceil(static_cast<float>(runInfo.gws1) / 4.0f));
  79         }
  80         else if (ofmPerWorkItem == 2)
  81         {
  82             runInfo.gws1 = static_cast<size_t>(std::ceil(static_cast<float>(runInfo.gws1) / 8.0f));
  83         }
  84         else
  85         {
  86             runInfo.gws1 = static_cast<size_t>(std::ceil(static_cast<float>(runInfo.gws1) / 8.0f));
  87         }
  88
  89         runInfo.gws0 = filter_ofm_num * batch_size / ofmPerWorkItem;
  90
  91         return runInfo;
  92     }
  93
  94     JitConstants ConvolutionKernel_yxfb_yxio_b1_block_mulitple_x::GetJitConstants(const convolution_params& params, const DispatchData& kd) const
  95     {
  96         auto cldnn_jit = ConvolutionKernelBase::GetJitConstants(params, kd);
  97
  98         size_t ofmPerWorkItem = GetOfmPerWorkitem(params.weights.OFM().v, local_work_size);
  99         cldnn_jit.AddConstant(MakeJitConstant("USE_VECTOR", ofmPerWorkItem));
 100         if (ofmPerWorkItem == 8)
 101         {
 102             cldnn_jit.AddConstant(MakeJitConstant("X_PER_WORK_ITEM", 2));
 103         }
 104         else if (ofmPerWorkItem == 4)
 105         {
 106             cldnn_jit.AddConstant(MakeJitConstant("X_PER_WORK_ITEM", 4));
 107         }
 108         else
 109         {
 110             cldnn_jit.AddConstant(MakeJitConstant("X_PER_WORK_ITEM", 8));
 111         }
 112
 113         cldnn_jit.AddConstant(MakeJitConstant("OFM_PER_WORK_ITEM", ofmPerWorkItem)); // how many output feature maps for a single batch will a single work item produce
 114         cldnn_jit.AddConstant(MakeJitConstant("LOCAL_WORK_GROUP_SIZE", kd.lws0));
 115         return cldnn_jit;
 116     }
 117
 118     bool ConvolutionKernel_yxfb_yxio_b1_block_mulitple_x::Validate(const Params& p, const optional_params& o) const
 119     {
 120         if (!ConvolutionKernelBase::Validate(p, o))
 121         {
 122             return false;
 123         }
 124
 125         const convolution_params& params = static_cast<const convolution_params&>(p);
 126
 127         if (!CheckPitchForSplitOnly(params))
 128         {
 129             return false;
 130         }
 131
 132         const auto filter_ofm_num = params.weights.OFM().v;
 133         const auto batch_size = params.output.Batch().v;
 134
 135         const bool bInputValidated =
 136             (filter_ofm_num > 0) &&
 137             (batch_size == 1) &&    // current implementation doesn't support batching
 138                                     // (subgorup is along batch*ofm and trying to block read filter/bias along batch and filter doesn't contain batching).
 139             (params.output.Feature().v == filter_ofm_num);
 140
 141         if (!bInputValidated)
 142         {
 143             return false;
 144         }
 145
 146         if ((filter_ofm_num * batch_size) % 16 != 0)
 147         {
 148             return false;
 149         }
 150
 151         return true;
 152     }
 153
 154     KernelsData ConvolutionKernel_yxfb_yxio_b1_block_mulitple_x::GetKernelsData(const Params& params, const optional_params& options) const
 155     {
 156         return GetTunedKernelsDataByIndex(params, options);
 157     }
 158 }