2 // Copyright (c) 2018-2019 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
17 #include "convolution_kernel_imad_3x3.h"
18 #include "kernel_selector_utils.h"
19 #include "common_tools.h"
22 // Kernel specific constants
25 // Threshold value to calculate the block size.
26 #define OUT_BLOCK_THRESHOLD 7
27 // For images 7x7 it's 7 (default), for 14x14 and above it's 14.
28 #define OUT_BLOCK_WIDTH 7
29 // For images 7x7 it's 1 (default), for 14x14 and above it's 2.
30 #define OUT_BLOCK_HEIGHT 1
// Computes the output tile size (outW x outH) that one work-item produces.
// Starts from the doubled defaults (used for 14x14-and-larger inputs) and
// falls back to the base OUT_BLOCK_WIDTH/OUT_BLOCK_HEIGHT when the input is
// small (inW <= OUT_BLOCK_THRESHOLD) or the doubled tile would exceed the
// SIMD lane budget (outW * Stride + Pad > SIMD_SIZE).
// NOTE(review): lines are elided in this view (no braces / no body for the
// second `if`); the trailing assert implies a further shrink step exists when
// even the base tile exceeds SIMD_SIZE — confirm against the full file.
32 static void getOutBlock_WH(size_t inW, size_t Stride, size_t Pad, size_t& outW, size_t& outH)
34 outW = OUT_BLOCK_WIDTH * 2;
35 outH = OUT_BLOCK_HEIGHT * 2;
// Shrink to the base tile if the input is small or the doubled tile
// does not fit within SIMD_SIZE lanes after stride and padding.
37 if ((inW <= OUT_BLOCK_THRESHOLD) ||
38 (outW * Stride + Pad > SIMD_SIZE)) {
39 outW = OUT_BLOCK_WIDTH;
40 outH = OUT_BLOCK_HEIGHT;
// Even the base tile may not fit; the (elided) body presumably reduces
// the tile further — TODO confirm.
42 if (outW * Stride + Pad > SIMD_SIZE) {
// Postcondition: the chosen tile must fit in the SIMD lane budget.
46 assert(outW * Stride + Pad <= SIMD_SIZE);
49 namespace kernel_selector {
// Capability key for this kernel: int8/uint8 activations and weights,
// int8/uint8 output, b_fs_yx_fsv4 input/output layout, per-feature bias
// (or no bias), int8 quantization with optional output calibration.
// NOTE(review): the `ParamsKey k;` declaration and `return k;` lines are
// elided in this view.
51 ParamsKey ConvolutionKernel_imad_3x3::GetSupportedKey() const
54 k.EnableInputDataType(Datatype::INT8);
55 k.EnableInputDataType(Datatype::UINT8);
56 k.EnableOutputDataType(Datatype::INT8);
57 k.EnableOutputDataType(Datatype::UINT8);
58 k.EnableInputWeightsType(WeightsType::INT8);
59 k.EnableInputWeightsType(WeightsType::UINT8);
// Only the 4-feature-blocked layout is supported on both ends.
60 k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
61 k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
// Activations and weights may use different int8 signedness.
62 k.EnableDifferentInputWeightsTypes();
63 k.EnableTensorOffset();
64 k.EnableTensorPitches();
66 k.EnableBiasPerFeature();
67 k.EnableNonBiasTerm();
69 k.EnableInt8Quantization();
70 k.EnableOutputCalibration();
// Public entry point: simply delegates to GetCommonKernelsData with the
// default execution mode / auto-tune index.
// NOTE(review): the return-type line and the `const Params& params,`
// parameter line are elided in this view.
76 ConvolutionKernel_imad_3x3::GetKernelsData(
78 const optional_params& options) const
80 return GetCommonKernelsData(params, options);
// Builds the JIT constant set handed to the OpenCL kernel: input/output
// spatial sizes and paddings, rounded-up feature depth, filter geometry,
// stride, SIMD/workgroup size, the per-work-item output tile, IMAD tiling
// factors, and signedness / swizzle control defines.
84 ConvolutionKernel_imad_3x3::GetJitConstants(
85 const convolution_params& params,
86 const DispatchData& kd) const
// Start from the base class's constants and append kernel-specific ones.
88 auto mem_consts = Parent::GetJitConstants(params, kd);
90 const auto& input = params.inputs[0];
91 const auto& output = params.output;
93 const auto& iDims = input.GetDims();
94 const auto& oDims = output.GetDims();
95 const auto& weights = params.weights;
96 const auto& wDims = weights.GetDims();
// Resolve per-layout channel positions so dims can be indexed by name.
97 const int iX = DataTensor::Channelndex(
98 input.GetLayout(), Tensor::DataChannelName::X);
99 const int iY = DataTensor::Channelndex(
100 input.GetLayout(), Tensor::DataChannelName::Y);
101 const int iB = DataTensor::Channelndex(
102 input.GetLayout(), Tensor::DataChannelName::BATCH);
103 const int iF = DataTensor::Channelndex(
104 input.GetLayout(), Tensor::DataChannelName::FEATURE);
105 const int wOD = WeightsTensor::Channelndex(
106 weights.GetLayout(), Tensor::WeightsChannelName::OFM);
107 const int oX = DataTensor::Channelndex(
108 output.GetLayout(), Tensor::DataChannelName::X);
109 const int oY = DataTensor::Channelndex(
110 output.GetLayout(), Tensor::DataChannelName::Y);
111 mem_consts.AddConstants({
112 MakeJitConstant("_IMAD_DEFINES", 1),
113 //MakeJitConstant("SCALE_FACTOR", m_ScaleFactor), //(255.0f / 700000.0f);
114 MakeJitConstant("_IW", iDims[iX].v),
115 MakeJitConstant("_IH", iDims[iY].v),
// Input depth rounded up to the IMAD packing factor of 4.
116 MakeJitConstant("_ID", RoundUp(iDims[iF].v, 4)),
117 MakeJitConstant("IWPAD", iDims[iX].pad.before + iDims[iX].pad.after),
118 MakeJitConstant("IHPAD", iDims[iY].pad.before + iDims[iY].pad.after),
119 MakeJitConstant("_OW", oDims[oX].v),
120 MakeJitConstant("_OH", oDims[oY].v),
// Output depth equals the weights' OFM count.
121 MakeJitConstant("_OD", wDims[wOD].v),
122 MakeJitConstant("OWPAD", oDims[oX].pad.before + oDims[oX].pad.after),
123 MakeJitConstant("OHPAD", oDims[oY].pad.before + oDims[oY].pad.after),
124 MakeJitConstant("SIMD_SIZE", SIMD_SIZE),
// NOTE(review): weights dims are indexed with the *data*-tensor channel
// indices iY/iX rather than WeightsChannelName::Y/X indices — this is only
// correct if the spatial positions coincide for the layouts in use; verify.
125 MakeJitConstant("K_HEIGHT", wDims[iY].v),
126 MakeJitConstant("K_WIDTH", wDims[iX].v),
127 MakeJitConstant("K_STRIDE", params.stride.x), // X and Y must be equal
128 MakeJitConstant("BATCH_SIZE", iDims[iB].v),
129 MakeJitConstant("WORKGROUP_SIZE", "SIMD_SIZE"),
// Per-work-item output tile, shared with SetDefault so JIT constants and
// dispatch geometry stay consistent.
133 getOutBlock_WH(iDims[iX].v, params.stride.x, iDims[iX].pad.before + iDims[iX].pad.after,
135 mem_consts.AddConstants({
136 MakeJitConstant("OUT_BLOCK_WIDTH", obw),
137 MakeJitConstant("OUT_BLOCK_HEIGHT", obh)
140 // FM_TILE definition
141 mem_consts.AddConstants({
// IMAD multiplies 4 packed int8 values per instruction.
142 MakeJitConstant("IMAD_LENGTH", 4),
143 MakeJitConstant("SYSTOLIC_DEPTH", 1),
144 MakeJitConstant("FM_TILE", "(IMAD_LENGTH * SYSTOLIC_DEPTH)")
147 if (input.GetDType() == Datatype::UINT8) {
148 // For unsigned types IMAD convolution kernel should skip
149 // all negative values.
150 mem_consts.AddConstants({
151 MakeJitConstant("CONVO_UNSIGNED", 1)
// Non-fsv4 output layouts need the kernel to undo the internal swizzle.
155 if (params.output.GetLayout() != DataLayout::b_fs_yx_fsv4) {
156 mem_consts.AddConstants({
157 // Produce unswizzled results.
158 MakeJitConstant("TO_UNSWIZZLE", 1),
// Computes the dispatch geometry: global range covers the output-width and
// output-height tile grids and the (OFM x batch) depth, with a local group
// of SIMD_SIZE along the depth dimension.
167 ConvolutionKernelBase::DispatchData ConvolutionKernel_imad_3x3::SetDefault(
168 const convolution_params& params,
173 const auto& in = params.inputs[0];
174 const auto& weights = params.weights;
175 const auto& iDims = in.GetDims();
176 const auto& wDims = weights.GetDims();
// Per-layout channel positions for name-based dim indexing.
177 const int iX = DataTensor::Channelndex(
178 in.GetLayout(), Tensor::DataChannelName::X);
179 const int iY = DataTensor::Channelndex(
180 in.GetLayout(), Tensor::DataChannelName::Y);
181 const int iB = DataTensor::Channelndex(
182 in.GetLayout(), Tensor::DataChannelName::BATCH);
183 const int wOD = WeightsTensor::Channelndex(
184 weights.GetLayout(), Tensor::WeightsChannelName::OFM);
// Same tile computation as GetJitConstants, so grid and JIT agree.
187 getOutBlock_WH(iDims[iX].v, params.stride.x, iDims[iX].pad.before + iDims[iX].pad.after,
190 std::vector<size_t> global = {
191 //globalRange[0] = ((_IW / K_STRIDE) + (OTW - 1)) / OTW;
192 // number of tiles needed to cover output width
193 (((iDims[iX].v / params.stride.x) + (otw - 1)) / otw),
195 //globalRange[1] = ((_IH / K_STRIDE) + (OTH - 1)) / OTH;
196 // number of tiles needed to cover output height
197 (((iDims[iY].v / params.stride.y) + (oth - 1)) / oth),
199 // globalRange[2] = (_OD * _B) + ((_B *_OD) % __WORKGROUP_SIZE);
200 // round depth range up
// NOTE(review): `A + (A % SIMD_SIZE)` is NOT a round-up to a multiple of
// SIMD_SIZE (e.g. A=17, SIMD=16 gives 18). It only works when A is already
// a multiple, or by accident — confirm intended behavior / CheckWorkGroups.
201 ((wDims[wOD].v * iDims[iB].v) + ((wDims[wOD].v * iDims[iB].v) % SIMD_SIZE))
// One sub-group of SIMD_SIZE work-items along the depth axis.
204 std::vector<size_t> local = {1, 1, SIMD_SIZE};
214 kd.cldnnStyle = { 0 };
215 kd.gemmStyle = { 0 };
// (sic) "effiency" is the field's actual spelling in this codebase.
216 kd.effiency = FORCE_PRIORITY_1;
// Rejects parameter combinations this kernel cannot handle: unequal X/Y
// strides, a filter size other than m_FilterSizeX x m_FilterSizeY, and an
// input width that is not a multiple of OUT_BLOCK_THRESHOLD.
// NOTE(review): the `return false;` / `return true;` lines are elided in
// this view.
223 ConvolutionKernel_imad_3x3::Validate(
224 const Params& params,
225 const optional_params& options) const
// Base-class checks first (types, layouts, etc.).
227 if (!Parent::Validate(params, options))
232 KernelData kd = KernelData::Default<convolution_params>(params);
233 convolution_params& newParams = *static_cast<convolution_params*>(kd.params.get());
235 if (newParams.stride.x != newParams.stride.y) {
236 // Strides must be equal
239 else if ((newParams.filterSize.x != m_FilterSizeX) ||
240 (newParams.filterSize.y != m_FilterSizeY)) {
241 // Kernel does not support such filter size
245 const auto& in = newParams.inputs[0];
246 const auto& iDims = in.GetDims();
247 const int iX = DataTensor::Channelndex(
248 in.GetLayout(), Tensor::DataChannelName::X);
249 if (iDims[iX].v % OUT_BLOCK_THRESHOLD != 0) {
250 // Input size must be multiple of OUT_BLOCK_THRESHOLD
// Full kernel construction pipeline: validate, compute dispatch geometry,
// reorder weights to a supported layout, generate JIT constants and the
// entry point, and fill in the CL kernel data.
// NOTE(review): the return-type line, early-return bodies, and the
// `succeed` check after UpdateWeightsParams are elided in this view.
259 ConvolutionKernel_imad_3x3::GetCommonKernelsData(
260 const Params& params,
261 const optional_params& options,
262 const std::string exeMode,
263 int autoTuneIndex) const
265 if (!Validate(params, options))
270 KernelData kd = KernelData::Default<convolution_params>(params);
271 convolution_params& newParams = *static_cast<convolution_params*>(kd.params.get());
272 DispatchData runInfo = SetDefault(newParams, autoTuneIndex);
// Guard against an inconsistent global/local work-group computation.
273 if (!CheckWorkGroups(runInfo))
275 // Internal Error - wrong calculation of global/local work group sizes
// Reorder weights into one of this kernel's supported layouts if needed.
279 bool succeed = UpdateWeightsParams(
282 GetSupportedWeightLayouts(newParams),
283 kd.weightsReorderParams,
291 auto finalKernelName = GetKernelName(newParams);
292 auto cldnnJit = GetJitConstants(newParams, runInfo);
293 auto entryPoint = GetEntryPoint(finalKernelName, newParams.layerID, options);
294 auto jit = CreateJit(finalKernelName, cldnnJit, entryPoint);
296 auto& kernel = kd.kernels[0];
297 FillCLKernelData(kernel, runInfo, params.engineInfo, finalKernelName, jit, entryPoint, exeMode, true, !newParams.bias.empty(), 1, newParams.int8_quantization, newParams.output_calibration);
// (sic) "effiency" matches the DispatchData field's spelling.
299 kd.estimatedTime = runInfo.effiency;
300 kd.autoTuneIndex = autoTuneIndex;
304 } // GetCommonKernelsData