inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_MMAD_blocks.cpp
/*
// Copyright (c) 2016 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/

#include "convolution_kernel_MMAD_blocks.h"

#include <algorithm>
#include <cassert>
#include <utility>

namespace kernel_selector
{
    ConvolutionKernel_MMAD_blocks::ConvolutionKernel_MMAD_blocks() : ConvolutionKernelBase("convolution_gpu_mmad_blocks")
    {
        // Generate the dispatch options for the auto-tuner.
        std::vector<size_t> blockWidthSizes = { 1,2,4,5,6,8,10,12,14,16,18,20,22,24,26,28,30,32 };
        std::vector<size_t> blockHeightSizes = { 1,2,3,4,5,6,7,8,9,10 };
        std::vector<size_t> prefetchSizes = { 1,2,3,4,5,6,8,10 };
        std::vector<std::string> executionModes = ConvolutionKernelBase::autoTuneOptions;
        const size_t maxBlockSize = 240;
        for (auto executionMode : executionModes)
        {
            for (auto blockWidth : blockWidthSizes)
            {
                for (auto blockHeight : blockHeightSizes)
                {
                    for (auto prefetch : prefetchSizes)
                    {
                        if (blockWidth * blockHeight <= maxBlockSize)
                        {
                            autoTuneOptions.emplace_back(AutoTuneOption{ blockWidth, blockHeight, prefetch, executionMode });
                        }
                    }
                }
            }
        }
    }

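    // Editorial note (hedged): the 240 cap above admits, e.g., a 24x10 block
    // (24 * 10 == 240) but rejects 32x8 (== 256). The limit presumably bounds
    // how many output accumulators each work item keeps live; the original
    // source does not document the rationale. The asserts below merely
    // document the filter and compile to nothing.
    static_assert(24 * 10 <= 240, "among the largest block areas accepted by the tuning tables above");
    static_assert(32 * 8 > 240, "example of a rejected width/height combination");
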
    ParamsKey ConvolutionKernel_MMAD_blocks::GetSupportedKey() const
    {
        ParamsKey k;
        k.EnableInputDataType(Datatype::INT8);
        k.EnableOutputDataType(Datatype::INT8);
        k.EnableInputWeightsType(WeightsType::INT8);
        k.EnableInputLayout(DataLayout::byxf_af32);
        k.EnableOutputLayout(DataLayout::byxf_af32);
        k.EnableTensorOffset();
        k.EnableTensorPitches();
        k.EnableDilation();
        k.EnableBiasPerFeature();
        k.EnableBiasPerOutput();
        k.EnableNonBiasTerm();
        k.EnableBatching();
        k.EnableSplitSupport();
        k.EnableInt8Quantization();
        k.EnableOutputCalibration();
        k.DisableTuning();
        return k;
    }

    bool ConvolutionKernel_MMAD_blocks::Validate(const Params& p, const optional_params& o) const
    {
        if (!Parent::Validate(p, o))
        {
            return false;
        }

        const convolution_params& params = static_cast<const convolution_params&>(p);

        // This kernel is designed for the INT8 quantization use case.
        if (!params.int8_quantization)
            return false;

        return true;
    }

    static void shrink_blocks_to_output_size(size_t output_x, size_t output_y, size_t& block_x, size_t& block_y)
    {
        // how many elements we will compute in each dimension
        size_t computed_x = Align(output_x, block_x);
        size_t computed_y = Align(output_y, block_y);
        // how many simds we need in each dimension
        size_t simds_x = computed_x / block_x;
        size_t simds_y = computed_y / block_y;
        // how many unused values we have in each dimension
        size_t unused_x = computed_x - output_x;
        size_t unused_y = computed_y - output_y;

        block_x -= unused_x / simds_x;
        block_y -= unused_y / simds_y;
    }

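    // Worked example (editorial): for output_x = 17 and block_x = 10,
    // Align(17, 10) = 20, so simds_x = 2 and unused_x = 3; block_x shrinks
    // by 3 / 2 = 1 to 9, and 2 SIMDs * 9 = 18 still covers all 17 outputs
    // while computing one wasted element instead of three.
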
    ConvolutionKernel_MMAD_blocks::AutoTuneOption ConvolutionKernel_MMAD_blocks::GetAutoTuneOptions(const Params& p, int autoTuneIndex) const
    {
        if ((autoTuneIndex >= 0) && (autoTuneIndex < (int)autoTuneOptions.size()))
        {
            return autoTuneOptions[autoTuneIndex];
        }

        // Sub-group size used by the "convolution_gpu_mmad_blocks" kernel.
        constexpr size_t sub_group_size = 16;

        AutoTuneOption option = { 0, 0, 0, DEFAULT };

        const convolution_params& cp = static_cast<const convolution_params&>(p);

        if (cp.stride.x == 1 && cp.stride.y == 1)
        {
            if (cp.filterSize.x == 1 && cp.filterSize.y == 1)
            {
                option.blockWidth = 16;
                option.blockHeight = 1;
                option.prefetch = 4;
            }
            // If fewer than 16 input values are needed to compute a single row of
            // output, each work item computes one whole row; this maximizes reuse
            // within the SIMD sub-group and gives very good performance results.
            else if (cp.output.X().v + (cp.filterSize.x - 1) * cp.dilation.x < sub_group_size)
            {
                option.blockWidth = cp.output.X().v;
                option.blockHeight = 1;
                option.prefetch = 4;
            }
            else if (cp.filterSize.x < 5 && cp.filterSize.y < 5)
            {
                option.blockWidth = sub_group_size - cp.filterSize.x + 1;
                option.blockHeight = 2;
                option.prefetch = 4;
            }
            else
            {
                option.blockWidth = 4;
                option.blockHeight = 3;
                option.prefetch = 4;
            }
        }
        else if (cp.stride.x == 2 && cp.stride.y == 2)
        {
            option.blockWidth = 5;
            option.blockHeight = 4;
            option.prefetch = 4;
        }
        else
        {
            option.blockWidth = 4;
            option.blockHeight = 3;
            option.prefetch = 5;
            //run_info.effiency = FORCE_PRIORITY_7; // GEMM is better
        }

        // If this is not the 1x1, batch-1 case, shrink the blocks to the output
        // size; otherwise we are memory bound and 16x1 blocks work best.
        if (cp.filterSize.x != 1 || cp.filterSize.y != 1 || cp.output.Batch().v != 1)
        {
            shrink_blocks_to_output_size(cp.output.X().v, cp.output.Y().v,
                option.blockWidth, option.blockHeight);
        }

        return option;
    }

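    // Worked example (editorial): a 3x3 filter with stride 1 and a 56x56
    // output takes the "filterSize < 5" branch above: blockWidth = 16 - 3 + 1
    // = 14, blockHeight = 2, prefetch = 4. The shrink step then leaves the
    // block unchanged, since Align(56, 14) = 56 and Align(56, 2) = 56 waste
    // no elements.
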
    static std::pair<size_t, size_t> get_byxf_af32_req_input_block_dims(
        size_t output_block_width,
        size_t output_block_height,
        const uSize& filter_size,
        const uSize& stride,
        const uSize& dilation,
        size_t sub_group_size = 8,
        size_t read_chunk_size = 8,
        size_t min_read_size = 8)
    {
        assert(output_block_width > 0 && output_block_height > 0);
        assert(stride.x > 0 && stride.y > 0);
        assert(filter_size.x > 0 && filter_size.y > 0);

        // Number of elements in X dimension needed from input to compute output block without re-reading input.
        size_t input_block_req_width = (output_block_width - 1) * stride.x + (filter_size.x - 1) * dilation.x + 1;
        // Number of elements in Y dimension needed from input to compute output block without re-reading input.
        size_t input_block_req_height = (output_block_height - 1) * stride.y + (filter_size.y - 1) * dilation.y + 1;

        // Required number of elements in X dimension rounded up to the read chunk size.
        size_t input_block_read_width = std::max(RoundUp(input_block_req_width, read_chunk_size), min_read_size);

        // Size of the input block array (private storage) per work item. Note
        // that the kernel consumes the raw element count, not the element count
        // divided across the sub-group.
        (void)sub_group_size; // part of the helper's interface; not used in the size
        size_t input_block_array_size = input_block_req_height * input_block_read_width;

        return std::make_pair(input_block_array_size, input_block_read_width);
    }

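    // Worked example (editorial): for a 14x2 output block, 3x3 filter,
    // stride 1, dilation 1, and the default chunk sizes:
    //   input_block_req_width  = 13 * 1 + 2 * 1 + 1 = 16
    //   input_block_req_height =  1 * 1 + 2 * 1 + 1 = 4
    //   input_block_read_width = max(RoundUp(16, 8), 8) = 16
    //   input_block_array_size = 4 * 16 = 64 elements per work item
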
    ConvolutionKernelBase::DispatchData ConvolutionKernel_MMAD_blocks::SetDefault(const convolution_params& cp, int autoTuneIndex) const
    {
        // Sub-group size used by "convolution_gpu_mmad_blocks" kernel.
        constexpr size_t sub_group_size = 8;

        DispatchData runInfo = ConvolutionKernelBase::SetDefault(cp);

        auto tuneOptions = GetAutoTuneOptions(cp, autoTuneIndex);
        runInfo.cldnnStyle.blockWidth = tuneOptions.blockWidth;
        runInfo.cldnnStyle.blockHeight = tuneOptions.blockHeight;
        runInfo.cldnnStyle.prefetch = tuneOptions.prefetch;

        auto input_block_dims = get_byxf_af32_req_input_block_dims(
            runInfo.cldnnStyle.blockWidth,
            runInfo.cldnnStyle.blockHeight,
            cp.filterSize,
            cp.stride,
            cp.dilation,
            sub_group_size,
            runInfo.fp16UnitUsed ? sub_group_size : sub_group_size / 2, // read chunk size
            sub_group_size);
        runInfo.cldnnStyle.inputBlockArraySize = input_block_dims.first;
        runInfo.cldnnStyle.inputBlockWidth = input_block_dims.second;

        const auto of_maps = cp.output.Feature().v;
        const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size);

        runInfo.effiency = FORCE_PRIORITY_3;

        // One work item per output block in X/Y; one sub-group lane per output feature map.
        runInfo.gws0 = CeilDiv(cp.output.X().v, runInfo.cldnnStyle.blockWidth);
        runInfo.gws1 = CeilDiv(cp.output.Y().v, runInfo.cldnnStyle.blockHeight);
        runInfo.gws2 = of_threads_per_batch * cp.output.Batch().v;

        runInfo.lws0 = 1;
        runInfo.lws1 = 1;
        runInfo.lws2 = sub_group_size;

        return runInfo;
    }

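    // Worked example (editorial): for a 56x56 output with 64 feature maps,
    // batch 1, and a 14x2 block: gws = { CeilDiv(56, 14), CeilDiv(56, 2),
    // RoundUp(64, 8) * 1 } = { 4, 28, 64 } and lws = { 1, 1, 8 }, i.e.
    // 4 * 28 * 64 work items dispatched in sub-groups of 8.
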
    JitConstants ConvolutionKernel_MMAD_blocks::GetJitConstants(const convolution_params& params, const DispatchData& runInfo) const
    {
        auto jit = Parent::GetJitConstants(params, runInfo);

        jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws2));
        jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", runInfo.cldnnStyle.blockWidth));
        jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_HEIGHT", runInfo.cldnnStyle.blockHeight));
        jit.AddConstant(MakeJitConstant("IN_BLOCK_ARRAY_SIZE", runInfo.cldnnStyle.inputBlockArraySize));
        jit.AddConstant(MakeJitConstant("IN_BLOCK_WIDTH", runInfo.cldnnStyle.inputBlockWidth));
        jit.AddConstant(MakeJitConstant("PREFETCH", runInfo.cldnnStyle.prefetch));

        // Pitch between consecutive OFM blocks in the special weights layout used
        // by this kernel: IFM is padded to a multiple of 32, and each block holds
        // 4 * 8 * 8 weight elements per filter position.
        const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
        const size_t filter_ofm_block_pitch = (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
        jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));
        return jit;
    }

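    // Worked example (editorial): for 3-channel 3x3 weights,
    // ifm_32_aligned = Align(3, 32) = 32, so FILTER_OFM_BLOCK_PITCH =
    // (32 / 32) * 3 * 3 * 4 * 8 * 8 = 2304 elements between OFM blocks.
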
    KernelsData ConvolutionKernel_MMAD_blocks::GetKernelsData(const Params& params, const optional_params& options) const
    {
        KernelsData kd = GetTunedKernelsDataByIndex(params, options);
        if (!kd.empty())
            kd[0].estimatedTime = FORCE_PRIORITY_2;

        return kd;
    }

    KernelsData ConvolutionKernel_MMAD_blocks::GetKernelsDataForAutoTune(const Params& params, const optional_params& options) const
    {
        if (!Validate(params, options))
        {
            return {};
        }

        KernelsData res = {};

        for (size_t i = 0; i < autoTuneOptions.size(); i++)
        {
            KernelsData kd = GetTunedKernelsDataByIndex(params, options, (int)i);
            if (!kd.empty())
            {
                res.emplace_back(kd[0]);
            }
        }

        return res;
    }
} // namespace kernel_selector