inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.cpp
/*
// Copyright (c) 2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/

#include "convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.h"
#include "kernel_selector_utils.h"

namespace kernel_selector {

    static const size_t _SG_TILE_M = 32;
    static const size_t _SG_TILE_N = 32;
    static const size_t _SG_SIZE = 8;        // sub-group size
    static const size_t _TILES_PER_SG_X = 1; // persistent threads
    static const size_t _TILES_PER_SG_Y = 1; // persistent threads
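
    // This kernel implements a 1x1 int8 convolution as a tiled GEMM, where
    // M = output X * output Y * batch, K = input feature count and N = output
    // feature count. Each work-group computes a 128x128 (WG_TILE_M x WG_TILE_N)
    // tile of the output matrix and each 8-wide sub-group computes a 32x32
    // (SG_TILE_M x SG_TILE_N) sub-tile, with data staged through shared local
    // memory (the "slm" in the kernel name).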

    ParamsKey ConvolutionKernel_mmad_32x32sg_128x128wg_slm_int8::GetSupportedKey() const
    {
        ParamsKey k;
        k.EnableInputDataType(Datatype::INT8);
        k.EnableOutputDataType(Datatype::INT8);
        k.EnableInputWeightsType(WeightsType::INT8);
        k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
        k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
        k.EnableTensorOffset();
        k.EnableTensorPitches();
        k.EnableBiasPerFeature();
        k.EnableBatching();
        k.EnableInt8Quantization();
        k.EnableOutputCalibration();
        k.DisableTuning();
        return k;
    }

    bool ConvolutionKernel_mmad_32x32sg_128x128wg_slm_int8::Validate(const Params& p, const optional_params& o) const
    {
        if (!ConvolutionKernelBase::Validate(p, o) ||
            !CovolutionCheckInput(p, o))
        {
            return false;
        }

        const convolution_params& cp = static_cast<const convolution_params&>(p);

        // make sure it's a 1x1 convolution
        if (cp.filterSize.x != 1 || cp.filterSize.y != 1)
            return false;

        // make sure the stride is 1x1
        if (cp.stride.x != 1 || cp.stride.y != 1)
            return false;

        // input padding is not supported
        if (cp.inputs[0].X().pad.Total() != 0 ||
            cp.inputs[0].Y().pad.Total() != 0 ||
            cp.inputs[0].Feature().pad.Total() != 0 ||
            cp.inputs[0].Batch().pad.Total() != 0)
            return false;

        // input and output spatial sizes must match
        if (cp.output.X().v != cp.inputs[0].X().v || cp.output.Y().v != cp.inputs[0].Y().v)
            return false;

        const auto m = cp.output.X().v * cp.output.Y().v * cp.output.Batch().v;
        const auto k = cp.inputs[0].Feature().v;
        const auto n = cp.output.Feature().v;

        if (m % 128 != 0)  // matrix size M must be a multiple of WG_TILE_M = 128 (and therefore of SG_TILE_M = 32)
            return false;

        if (k % 32 != 0)   // matrix size K must be a multiple of MATRIX_SMALL_K = 32
            return false;

        if (n % 128 != 0)  // matrix size N must be a multiple of WG_TILE_N = 128 (and therefore of SG_TILE_N = 32)
            return false;

        return true;
    }
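
    // For example, a 1x1 convolution over an 8x8 input with batch 4, 256 input
    // features and 128 output features gives M = 8 * 8 * 4 = 256, K = 256 and
    // N = 128, which satisfies every check above.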

    ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_32x32sg_128x128wg_slm_int8::SetDefault(const convolution_params& arg, int) const
    {
        DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);

        runInfo.effiency = FORCE_PRIORITY_1;

        size_t mat_m = arg.output.X().v * arg.output.Y().v * arg.output.Batch().v;
        size_t mat_n = arg.output.Feature().v;

        size_t _MATRIX_M = mat_m;
        size_t _MATRIX_N = mat_n;

        size_t _WG_TILE_M = 128;
        size_t _WG_TILE_N = 128;

        // Calculate number of threads needed
        const size_t threadsX = (_MATRIX_N / (_SG_TILE_N / _SG_SIZE)) / _TILES_PER_SG_X;
        const size_t threadsY = (_MATRIX_M / _SG_TILE_M) / _TILES_PER_SG_Y;

        // Define execution setup for kernel:
        size_t globalWorkSize[3] = { threadsX, threadsY, 1 };
        size_t localWorkSize[3] = { _SG_SIZE * _WG_TILE_N / _SG_TILE_N, _WG_TILE_M / _SG_TILE_M, 1 };

        runInfo.gws0 = globalWorkSize[0];
        runInfo.gws1 = globalWorkSize[1];
        runInfo.gws2 = globalWorkSize[2];

        runInfo.lws0 = localWorkSize[0];
        runInfo.lws1 = localWorkSize[1];
        runInfo.lws2 = localWorkSize[2];

        return runInfo;
    }
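
    // For the 8x8, batch-4 example above (MATRIX_M = 256, MATRIX_N = 128):
    // threadsX = (128 / (32 / 8)) / 1 = 32 and threadsY = (256 / 32) / 1 = 8,
    // so GWS = { 32, 8, 1 } and LWS = { 32, 4, 1 }. That launches two work-groups,
    // each consisting of 16 sub-groups of 8 work-items and covering one 128x128
    // tile of the 256x128 output matrix.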

    JitConstants ConvolutionKernel_mmad_32x32sg_128x128wg_slm_int8::GetJitConstants(const convolution_params& params, const DispatchData& runInfo) const
    {
        auto jit = Parent::GetJitConstants(params, runInfo);

        jit.AddConstant(MakeJitConstant("WG_TILE_M", 128));     // work-group tile size M, must be a multiple of 32
        jit.AddConstant(MakeJitConstant("WG_TILE_N", 128));     // work-group tile size N, must be a multiple of 32
        jit.AddConstant(MakeJitConstant("TILES_PER_SG_X", 1));  // persistent threads
        jit.AddConstant(MakeJitConstant("TILES_PER_SG_Y", 1));  // persistent threads

        // Do not change values below
        jit.AddConstant(MakeJitConstant("DIM_X", 0));
        jit.AddConstant(MakeJitConstant("DIM_Y", 1));
        jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K", 32));
        jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K_BFLOAT", 16));
        jit.AddConstant(MakeJitConstant("SG_TILE_M", _SG_TILE_M));
        jit.AddConstant(MakeJitConstant("SG_TILE_N", _SG_TILE_N));
        jit.AddConstant(MakeJitConstant("SG_SIZE", _SG_SIZE));
        jit.AddConstant(MakeJitConstant("SIMD_LANE_M", "SG_TILE_M"));
        jit.AddConstant(MakeJitConstant("SIMD_LANE_N", "(SG_TILE_N / SG_SIZE)"));
        jit.AddConstant(MakeJitConstant("WG_SIZE", "(SG_SIZE * WG_TILE_N / SG_TILE_N) * (WG_TILE_M / SG_TILE_M)"));

        jit.AddConstant(MakeJitConstant("COMPILE_KERNELS", ""));
        jit.AddConstant(MakeJitConstant("TILED_GLOBAL_LAYOUT", ""));
        jit.AddConstant(MakeJitConstant("OUTPUT_TILED_GLOBAL_LAYOUT", ""));

        const auto& input = params.inputs[0];
        const auto& output = params.output;

        auto m = output.X().v * output.Y().v * output.Batch().v;
        auto k = input.Feature().v;
        auto n = output.Feature().v;

        jit.AddConstant(MakeJitConstant("MATRIX_M", m));
        jit.AddConstant(MakeJitConstant("MATRIX_K", k));
        jit.AddConstant(MakeJitConstant("MATRIX_N", n));

        const size_t out_x_pitch = 32 * 4;
        const size_t out_y_pitch = 32 * 4 * params.output.X().LogicalDimPadded();
        const size_t out_b_block_pitch = out_y_pitch * params.output.Y().LogicalDimPadded();
        const size_t out_f_block_pitch = out_b_block_pitch * ((params.output.Batch().v + 3) / 4);
        const size_t out_offset = out_x_pitch * params.output.X().pad.before + out_y_pitch * params.output.Y().pad.before;

        jit.AddConstant(MakeJitConstant("OUT_X_PITCH", out_x_pitch));
        jit.AddConstant(MakeJitConstant("OUT_Y_PITCH", out_y_pitch));
        jit.AddConstant(MakeJitConstant("OUT_B_BLOCK_PITCH", out_b_block_pitch));
        jit.AddConstant(MakeJitConstant("OUT_F_BLOCK_PITCH", out_f_block_pitch));
        jit.AddConstant(MakeJitConstant("OUT_OFFSET", out_offset));

        bool out_padding = output.X().pad.Total() != 0 || output.Y().pad.Total() != 0;
        jit.AddConstant(MakeJitConstant("OUT_WITH_PADDING", out_padding));

        return jit;
    }
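
    // In the fs_bs_yx_bsv4_fsv32 layout, one step along X spans a full
    // 32-feature x 4-batch block of int8 values, hence out_x_pitch = 32 * 4 = 128.
    // For the 8x8, batch-4 example with no output padding: out_y_pitch = 128 * 8 = 1024,
    // out_b_block_pitch = 1024 * 8 = 8192, out_f_block_pitch = 8192 * ((4 + 3) / 4) = 8192
    // and out_offset = 0.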

    KernelsData ConvolutionKernel_mmad_32x32sg_128x128wg_slm_int8::GetKernelsData(const Params& params, const optional_params& options) const
    {
        KernelsData kd = GetCommonKernelsData(params, options);
        if (!kd.empty())
            kd[0].estimatedTime = FORCE_PRIORITY_1;
        return kd;
    }
}