inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_3x3.cpp

   1 // Copyright (c) 2018-2019 Intel Corporation
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //      http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15
  16 #include "convolution_kernel_imad_3x3.h"
  17 #include "kernel_selector_utils.h"
  18 #include "common_tools.h"
  19 #include <vector>
  20
  21 //
  22 // Kernel specific constants
  23 //
  24 #define SIMD_SIZE 16
  25 // Threshold value to calculate the block size.
  26 #define OUT_BLOCK_THRESHOLD 7
  27
  28 static bool getOutBlock_WH(size_t output_size,
  29                            size_t stride,
  30                            size_t kernel_size,
  31                            size_t& output_block_w,
  32                            size_t& output_block_h) {
  33     bool verify_output_ranges = false;
  34
  35     output_block_w = output_block_h = 0;
  36
  37     size_t upper_border = output_size < SIMD_SIZE ? output_size : SIMD_SIZE;
  38
  39     size_t stride_restrictions = (SIMD_SIZE - (kernel_size - 1)) / stride;
  40
  41     size_t max_posible_tile_size = upper_border < stride_restrictions ? upper_border : stride_restrictions;
  42
  43     if (output_size % max_posible_tile_size == 0) {
  44         output_block_w = max_posible_tile_size;
  45     } else {
  46         size_t min_horisontal_block_size = 2;  // 4;
  47
  48         size_t block_size = 0;
  49
  50         for (size_t i = min_horisontal_block_size; i < max_posible_tile_size; i++) {
  51             if (output_size % i == 0)
  52                 block_size = i;
  53         }
  54
  55         if (block_size != 0) {
  56             output_block_w = block_size;
  57         } else {
  58             output_block_w = max_posible_tile_size;
  59             verify_output_ranges = true;
  60         }
  61     }
  62
  63     if (output_block_w <= 4)
  64         output_block_h = output_block_w;
  65     else
  66         output_block_h = 1;
  67
  68     return verify_output_ranges;
  69 }
  70
  71 namespace kernel_selector {
  72
  73 ParamsKey ConvolutionKernel_imad_3x3::GetSupportedKey() const {
  74     ParamsKey k;
  75     k.EnableInputDataType(Datatype::INT8);
  76     k.EnableInputDataType(Datatype::UINT8);
  77     k.EnableOutputDataType(Datatype::INT8);
  78     k.EnableOutputDataType(Datatype::UINT8);
  79     k.EnableInputWeightsType(WeightsType::INT8);
  80     k.EnableInputWeightsType(WeightsType::UINT8);
  81     k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
  82     k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
  83     k.EnableDifferentInputWeightsTypes();
  84     k.EnableTensorOffset();
  85     k.EnableTensorPitches();
  86     k.EnableDilation();
  87     k.EnableBiasPerFeature();
  88     k.EnableNonBiasTerm();
  89     k.EnableBatching();
  90     k.EnableInt8Quantization();
  91     k.EnableOutputCalibration();
  92     k.DisableTuning();
  93     return k;
  94 }
  95
  96 KernelsData ConvolutionKernel_imad_3x3::GetKernelsData(const Params& params, const optional_params& options) const {
  97     return GetCommonKernelsData(params, options);
  98 }
  99
 100 JitConstants ConvolutionKernel_imad_3x3::GetJitConstants(const convolution_params& params,
 101                                                          const DispatchData& kd) const {
 102     auto mem_consts = Parent::GetJitConstants(params, kd);
 103
 104     auto activation_constants =
 105         MakeActivationJitConstants(params.activation, "_CONV");
 106     mem_consts.Merge(activation_constants);
 107
 108     const auto& input = params.inputs[0];
 109     const auto& output = params.output;
 110
 111     const auto& iDims = input.GetDims();
 112     const auto& oDims = output.GetDims();
 113     const auto& weights = params.weights;
 114     const auto& wDims = weights.GetDims();
 115     const int iX = DataTensor::Channelndex(input.GetLayout(), Tensor::DataChannelName::X);
 116     const int iY = DataTensor::Channelndex(input.GetLayout(), Tensor::DataChannelName::Y);
 117     const int iF = DataTensor::Channelndex(input.GetLayout(), Tensor::DataChannelName::FEATURE);
 118     const int wOD = WeightsTensor::Channelndex(weights.GetLayout(), Tensor::WeightsChannelName::OFM);
 119     const int oX = DataTensor::Channelndex(output.GetLayout(), Tensor::DataChannelName::X);
 120     const int oY = DataTensor::Channelndex(output.GetLayout(), Tensor::DataChannelName::Y);
 121     mem_consts.AddConstants({
 122         MakeJitConstant("_IW", iDims[iX].v),
 123         MakeJitConstant("_IH", iDims[iY].v),
 124         MakeJitConstant("_ID", RoundUp(iDims[iF].v, 4)),
 125         MakeJitConstant("IWPAD", iDims[iX].pad.before + iDims[iX].pad.after),
 126         MakeJitConstant("IHPAD", iDims[iY].pad.before + iDims[iY].pad.after),
 127         MakeJitConstant("_OW", oDims[oX].v),
 128         MakeJitConstant("_OH", oDims[oY].v),
 129         MakeJitConstant("_OD", wDims[wOD].v),
 130         MakeJitConstant("OWPAD", oDims[oX].pad.before + oDims[oX].pad.after),
 131         MakeJitConstant("OHPAD", oDims[oY].pad.before + oDims[oY].pad.after),
 132         MakeJitConstant("SIMD_SIZE", SIMD_SIZE),
 133         MakeJitConstant("K_HEIGHT", wDims[iY].v),
 134         MakeJitConstant("K_WIDTH", wDims[iX].v),
 135         MakeJitConstant("K_STRIDE", params.stride.x),  // X and Y must be equal
 136     });
 137
 138     size_t obw, obh;
 139     getOutBlock_WH(oDims[oX].v, params.stride.x, wDims[iX].v, obw, obh);
 140     mem_consts.AddConstants({MakeJitConstant("OUT_BLOCK_WIDTH", obw), MakeJitConstant("OUT_BLOCK_HEIGHT", obh)});
 141
 142     return mem_consts;
 143 }  // GetJitConstants
 144
 145 ConvolutionKernelBase::DispatchData ConvolutionKernel_imad_3x3::SetDefault(const convolution_params& params,
 146                                                                            int) const {
 147     DispatchData kd;
 148
 149     const auto& in = params.inputs[0];
 150     const auto& output = params.output;
 151     const auto& weights = params.weights;
 152     const auto& iDims = in.GetDims();
 153     const auto& oDims = output.GetDims();
 154     const auto& wDims = weights.GetDims();
 155     const int iX = DataTensor::Channelndex(in.GetLayout(), Tensor::DataChannelName::X);
 156     const int iY = DataTensor::Channelndex(in.GetLayout(), Tensor::DataChannelName::Y);
 157     const int iB = DataTensor::Channelndex(in.GetLayout(), Tensor::DataChannelName::BATCH);
 158     const int oX = DataTensor::Channelndex(output.GetLayout(), Tensor::DataChannelName::X);
 159     const int wOD = WeightsTensor::Channelndex(weights.GetLayout(), Tensor::WeightsChannelName::OFM);
 160
 161     size_t otw, oth;
 162     getOutBlock_WH(oDims[oX].v, params.stride.x, wDims[iX].v, otw, oth);
 163
 164     std::vector<size_t> global = {// globalRange[0] = ((_IW / K_STRIDE) + (OTW - 1)) / OTW;
 165                                   // number of tiles needed to cover output width
 166                                   (((iDims[iX].v / params.stride.x) + (otw - 1)) / otw),
 167
 168                                   // globalRange[1] = ((_IH / K_STRIDE) + (OTH - 1)) / OTH;
 169                                   // number of tiles needed to cover output height
 170                                   (((iDims[iY].v / params.stride.y) + (oth - 1)) / oth),
 171
 172                                   // globalRange[2] = (_OD * _B) + ((_B *_OD) % __WORKGROUP_SIZE);
 173                                   // round depth range up
 174                                   ((wDims[wOD].v * iDims[iB].v) + ((wDims[wOD].v * iDims[iB].v) % SIMD_SIZE))};
 175
 176     std::vector<size_t> local = {1, 1, SIMD_SIZE};
 177
 178     kd.gws0 = global[0];
 179     kd.gws1 = global[1];
 180     kd.gws2 = global[2];
 181
 182     kd.lws0 = local[0];
 183     kd.lws1 = local[1];
 184     kd.lws2 = local[2];
 185
 186     kd.cldnnStyle = {0, 0, 0, 0, 0};
 187     kd.gemmStyle = {0, 0, 0, 0, 0, 0};
 188     kd.effiency = FORCE_PRIORITY_1;
 189
 190     return kd;
 191 }  // SetDefault
 192
 193 bool ConvolutionKernel_imad_3x3::Validate(const Params& params, const optional_params& options) const {
 194     if (!Parent::Validate(params, options)) {
 195         return false;
 196     }
 197
 198     KernelData kd = KernelData::Default<convolution_params>(params);
 199     convolution_params& newParams = *static_cast<convolution_params*>(kd.params.get());
 200
 201     if (newParams.stride.x != newParams.stride.y) {
 202         // Strides must be equial
 203         return false;
 204     } else if ((newParams.filterSize.x != m_FilterSizeX) || (newParams.filterSize.y != m_FilterSizeY)) {
 205         // Kernel does not support such filter size
 206         return false;
 207     } else {
 208         const auto& in = newParams.inputs[0];
 209         const auto& iDims = in.GetDims();
 210         const int iX = DataTensor::Channelndex(in.GetLayout(), Tensor::DataChannelName::X);
 211         if (iDims[iX].v % OUT_BLOCK_THRESHOLD != 0) {
 212             // Input size must be multiple of OUT_BLOCK_THRESHOLD
 213             return false;
 214         }
 215     }
 216
 217     return true;
 218 }
 219 }  // namespace kernel_selector