inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16_2_sg.cpp

   1 /*
   2 // Copyright (c) 2016 Intel Corporation
   3 //
   4 // Licensed under the Apache License, Version 2.0 (the "License");
   5 // you may not use this file except in compliance with the License.
   6 // You may obtain a copy of the License at
   7 //
   8 //      http://www.apache.org/licenses/LICENSE-2.0
   9 //
  10 // Unless required by applicable law or agreed to in writing, software
  11 // distributed under the License is distributed on an "AS IS" BASIS,
  12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 // See the License for the specific language governing permissions and
  14 // limitations under the License.
  15 */
  16
  17 #include "convolution_kernel_bfyx_os_iyx_osv16_2_sg.h"
  18
  19 namespace kernel_selector
  20 {
    // Sub-group size used by the "convolution_gpu_bfyx_os_iyx_osv16_2_sg" kernel.
    // SetDefault() sizes each work-group as 2 * sub_group_size work-items along dimension 2,
    // and the same value is used to round the output feature count in this file.
    constexpr size_t sub_group_size = 16;
  23
  24     ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::ConvolutionKernel_bfyx_os_iyx_osv16_2_sg() : ConvolutionKernelBase("convolution_gpu_bfyx_os_iyx_osv16_2_sg")
  25     {
  26         // Generate the dispatch options to the auto-tuner.
  27         std::vector<size_t> blockWidthSizes = { 1,2,4,5,6,8,10,12,14,16 };
  28         std::vector<size_t> blockHeightSizes = { 1,2,3,4,5 };
  29         std::vector<size_t> prefetchSizes = { 1,2,3,4,5,6,8,10 };
  30         std::vector<std::string> executionModes = ConvolutionKernelBase::autoTuneOptions;
  31         const size_t maxBlockSize = 60;
  32
  33         for (auto executionMode : executionModes)
  34         {
  35             for (auto blockWidth : blockWidthSizes)
  36             {
  37                 for (auto blockHeight : blockHeightSizes)
  38                 {
  39                     for (auto prefetch : prefetchSizes)
  40                     {
  41                             if (blockWidth * blockHeight <= maxBlockSize)
  42                             {
  43                                 autoTuneOptions.emplace_back(AutoTuneOption{ blockWidth, blockHeight, prefetch, executionMode });
  44                             }
  45                     }
  46                 }
  47             }
  48         }
  49     }
  50
  51     ParamsKey ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetSupportedKey() const
  52     {
  53         ParamsKey k;
  54         k.EnableInputDataType(Datatype::F16);
  55         k.EnableInputDataType(Datatype::F32);
  56         k.EnableInputWeightsType(WeightsType::F16);
  57         k.EnableInputWeightsType(WeightsType::F32);
  58         k.EnableOutputDataType(Datatype::F16);
  59         k.EnableOutputDataType(Datatype::F32);
  60         k.EnableInputLayout(DataLayout::bfyx);
  61         k.EnableOutputLayout(DataLayout::bfyx);
  62         k.EnableTensorOffset();
  63         k.EnableTensorPitches();
  64         k.EnableSubGroup();
  65         k.EnableBiasPerFeature();
  66         k.EnableBiasPerOutput();
  67         k.EnableNonBiasTerm();
  68         k.EnableBatching();
  69         k.EnableSplitSupport();
  70         k.EnableDilation();
  71         k.EnableTranspose();
  72         return k;
  73     }
  74
  75     static std::pair<size_t, size_t> get_bfyx_req_input_block_dims(
  76         size_t output_block_width,
  77         size_t output_block_height,
  78         const uSize& filter_size,
  79         const uSize& stride,
  80         const uSize& dilation,
  81         size_t sg_size = 16,
  82         size_t read_chunk_size = 8,
  83         size_t min_read_size = 16)
  84     {
  85         assert(output_block_width > 0 && output_block_height > 0);
  86         assert(stride.x > 0 && stride.y > 0);
  87         assert(filter_size.x > 0 && filter_size.y > 0);
  88
  89         // Number of elements in X dimension needed from input to compute output block without re-reading input.
  90         size_t input_block_req_width = (output_block_width - 1) * stride.x + (filter_size.x - 1)*dilation.x + 1;
  91         // Number of elements in Y dimension needed from input to compute output block without re-reading input.
  92         size_t input_block_req_height = (output_block_height - 1) * stride.y + (filter_size.y - 1)*dilation.y + 1;
  93
  94         // Required number of elements in X dimension rounded to nearest >= read chunk size.
  95         size_t input_block_read_width = std::max(RoundUp(input_block_req_width, read_chunk_size), min_read_size);
  96         // Number of sub-group-sized vectors of unit type needed to store input block.
  97         size_t input_block_array_size = CeilDiv(input_block_req_height * input_block_read_width, sg_size);
  98
  99         return std::make_pair(input_block_array_size, input_block_read_width);
 100     }
 101
 102     static void shrink_blocks_to_output_size(size_t output_x, size_t output_y, size_t &block_x, size_t &block_y)
 103     {
 104         // how many elements we will compute in each dimension
 105         size_t computed_x = Align(output_x, block_x);
 106         size_t computed_y = Align(output_y, block_y);
 107         // how many simds we need in each dimension
 108         size_t simds_x = computed_x / block_x;
 109         size_t simds_y = computed_y / block_y;
 110         // how many unused values we have in each dimension
 111         size_t unused_x = computed_x - output_x;
 112         size_t unused_y = computed_y - output_y;
 113
 114         block_x -= unused_x / simds_x;
 115         block_y -= unused_y / simds_y;
 116     }
 117
 118     ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::AutoTuneOption ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetAutoTuneOptions(const Params& p, int autoTuneIndex) const
 119     {
 120         if ((autoTuneIndex >= 0) && (autoTuneIndex < (int)autoTuneOptions.size()))
 121         {
 122             return autoTuneOptions[autoTuneIndex];
 123         }
 124
 125         AutoTuneOption option = { 0, 0, 0, DEFAULT };
 126
 127         const convolution_params& cp = static_cast<const convolution_params&>(p);
 128
 129         if (cp.stride.x == 1 && cp.stride.y == 1)
 130         {
 131             if (cp.filterSize.x == 1 && cp.filterSize.y == 1)
 132             {
 133                 option.blockWidth = 16;
 134                 option.blockHeight = 1;
 135                 option.prefetch = 4;
 136             }
 137             //if less than 16 values is required to compute one single row of output
 138             //then each WI shall compute one single row to maximize reuse within SIMD subgroup (this gives very nice performance results)
 139             else if (cp.output.X().v + (cp.filterSize.x - 1)*cp.dilation.x < sub_group_size)
 140             {
 141                 option.blockWidth = cp.output.X().v;
 142                 option.blockHeight = 1;
 143                 option.prefetch = 4;
 144             }
 145             else if (cp.filterSize.x < 5 && cp.filterSize.y < 5)
 146             {
 147                 option.blockWidth = sub_group_size - cp.filterSize.x + 1;
 148                 option.blockHeight = 2;
 149                 option.prefetch = 4;
 150             }
 151             else
 152             {
 153                 option.blockWidth = 4;
 154                 option.blockHeight = 3;
 155                 option.prefetch = 4;
 156             }
 157         }
 158         else if (cp.stride.x == 2 && cp.stride.y == 2)
 159         {
 160             option.blockWidth = 5;
 161             option.blockHeight = 4;
 162             option.prefetch = 4;
 163         }
 164         else
 165         {
 166             option.blockWidth = 4;
 167             option.blockHeight = 3;
 168             option.prefetch = 5;
 169             //run_info.effiency = FORCE_PRIORITY_7; // GEMM is better
 170         }
 171
 172         // if this is not 1x1 batch1 case then shrink filters, other way we're memory bound and it's best to use 16x1 block sizes
 173         if (cp.filterSize.x != 1 || cp.filterSize.y != 1 || cp.output.Batch().v != 1)
 174         {
 175             shrink_blocks_to_output_size(cp.output.X().v, cp.output.Y().v,
 176                 option.blockWidth, option.blockHeight);
 177         }
 178
 179         return option;
 180     }
 181
 182     ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::SetDefault(const convolution_params& cp, int autoTuneIndex) const
 183     {
 184         DispatchData runInfo = ConvolutionKernelBase::SetDefault(cp);
 185
 186         const auto of_maps = cp.output.Feature().v;
 187         const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size);
 188
 189         runInfo.effiency = FORCE_PRIORITY_3;
 190
 191         auto tuneOptions = GetAutoTuneOptions(cp, autoTuneIndex);
 192         runInfo.cldnnStyle.blockWidth = tuneOptions.blockWidth;
 193         runInfo.cldnnStyle.blockHeight = tuneOptions.blockHeight;
 194         runInfo.cldnnStyle.prefetch = tuneOptions.prefetch;
 195
 196         auto input_block_dims = get_bfyx_req_input_block_dims(
 197             runInfo.cldnnStyle.blockWidth,
 198             runInfo.cldnnStyle.blockHeight,
 199             cp.filterSize,
 200             cp.stride,
 201             cp.dilation,
 202             sub_group_size,
 203             runInfo.fp16UnitUsed ? sub_group_size : sub_group_size / 2,
 204             sub_group_size);
 205         runInfo.cldnnStyle.inputBlockArraySize = input_block_dims.first;
 206         runInfo.cldnnStyle.inputBlockWidth = input_block_dims.second;
 207
 208         runInfo.gws0 = CeilDiv(cp.output.X().v, runInfo.cldnnStyle.blockWidth);
 209         runInfo.gws1 = CeilDiv(cp.output.Y().v, runInfo.cldnnStyle.blockHeight);
 210         runInfo.gws2 = 2 * of_threads_per_batch * cp.output.Batch().v;
 211
 212         runInfo.lws0 = 1;
 213         runInfo.lws1 = 1;
 214         runInfo.lws2 = 2*sub_group_size;
 215
 216         return runInfo;
 217     }
 218
 219     bool ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::Validate(const Params& p, const optional_params& o) const
 220     {
 221         if (!ConvolutionKernelBase::Validate(p, o) ||
 222             !CovolutionCheckInput(p, o))
 223         {
 224             return false;
 225         }
 226
 227         const convolution_params& cp = static_cast<const convolution_params&>(p);
 228
 229         if (cp.inputs[0].Feature().v % 2 != 0 || cp.inputs[0].Feature().v < 64)
 230             return false;
 231
 232         if (cp.output.Feature().v % 64 != 0)
 233             return false;
 234
 235         return true;
 236     }
 237
 238     JitConstants ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetJitConstants(const convolution_params& params, const DispatchData& runInfo) const
 239     {
 240         const auto of_maps = params.output.Feature().v;
 241         const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size);
 242         size_t leftovers = of_threads_per_batch - of_maps;
 243
 244         auto jit = Parent::GetJitConstants(params, runInfo);
 245
 246         jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", 16));
 247         jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", runInfo.cldnnStyle.blockWidth));
 248         jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_HEIGHT", runInfo.cldnnStyle.blockHeight));
 249         jit.AddConstant(MakeJitConstant("IN_BLOCK_ARRAY_SIZE", runInfo.cldnnStyle.inputBlockArraySize));
 250         jit.AddConstant(MakeJitConstant("IN_BLOCK_WIDTH", runInfo.cldnnStyle.inputBlockWidth));
 251         jit.AddConstant(MakeJitConstant("PREFETCH", runInfo.cldnnStyle.prefetch));
 252
 253         if (leftovers)
 254         {
 255             jit.AddConstant(MakeJitConstant("LEFTOVERS", leftovers));
 256         }
 257
 258         return jit;
 259     }
 260
 261     std::vector<WeightsLayout> ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetSupportedWeightLayouts(const convolution_params& params) const
 262     {
 263         if (!params.transposed)
 264         {
 265             return{ WeightsLayout::os_iyx_osv16 };
 266         }
 267         else
 268         {
 269             return{ WeightsLayout::os_iyx_osv16_rotate_180 };
 270         }
 271     }
 272
    // Returns the kernel data for the given parameters by delegating to
    // GetTunedKernelsDataByIndex() without passing an explicit auto-tune index.
    KernelsData ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetKernelsData(const Params& params, const optional_params& options) const
    {
        return GetTunedKernelsDataByIndex(params, options);
    }
 277
 278     KernelsData ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetKernelsDataForAutoTune(const Params& params, const optional_params& options) const
 279     {
 280         if (!Validate(params, options))
 281         {
 282             return{};
 283         }
 284
 285         KernelsData res = {};
 286
 287         for (size_t i = 0; i < autoTuneOptions.size(); i++)
 288         {
 289             KernelsData kd = GetTunedKernelsDataByIndex(params, options, (int)i);
 290             if (!kd.empty())
 291             {
 292                 res.emplace_back(kd[0]);
 293             }
 294         }
 295
 296         return res;
 297     }
 298
 299 }