inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_bfyx_os_iyx_osv16.cpp

   1 /*
   2 // Copyright (c) 2018 Intel Corporation
   3 //
   4 // Licensed under the Apache License, Version 2.0 (the "License");
   5 // you may not use this file except in compliance with the License.
   6 // You may obtain a copy of the License at
   7 //
   8 //      http://www.apache.org/licenses/LICENSE-2.0
   9 //
  10 // Unless required by applicable law or agreed to in writing, software
  11 // distributed under the License is distributed on an "AS IS" BASIS,
  12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 // See the License for the specific language governing permissions and
  14 // limitations under the License.
  15 */
  16
  17 #include "fused_conv_eltwise_kernel_bfyx_os_iyx_osv16.h"
  18
  19 namespace kernel_selector
  20 {
  21     // Sub-group size used by "kernel_name_bfyx_os_iyx_osv16" kernel.
  22     constexpr size_t sub_group_size = 16;
  23
  24     fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::fused_conv_eltwise_kernel_bfyx_os_iyx_osv16() : fused_conv_eltwise_kernel_base("fused_conv_eltwise_gpu_bfyx_os_iyx_osv16")
  25     {
  26         // Generate the dispatch options to the auto-tuner.
  27         std::vector<size_t> blockWidthSizes = { 1,2,4,5,6,8,10,12,14,16 };
  28         std::vector<size_t> blockHeightSizes = { 1,2,3,4,5 };
  29         std::vector<size_t> prefetchSizes = { 1,2,3,4,5,6,8,10 };
  30         std::vector<std::string> executionModes = fused_conv_eltwise_kernel_base::autoTuneOptions;
  31         const size_t maxBlockSize = 60;
  32
  33         for (auto executionMode : executionModes)
  34         {
  35             for (auto blockWidth : blockWidthSizes)
  36             {
  37                 for (auto blockHeight : blockHeightSizes)
  38                 {
  39                     for (auto prefetch : prefetchSizes)
  40                     {
  41                             if (blockWidth * blockHeight <= maxBlockSize)
  42                             {
  43                                 autoTuneOptions.emplace_back(AutoTuneOption{ blockWidth, blockHeight, prefetch, executionMode });
  44                             }
  45                     }
  46                 }
  47             }
  48         }
  49     }
  50
  51     ParamsKey fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::GetSupportedKey() const
  52     {
  53         ParamsKey k;
  54         k.EnableInputDataType(Datatype::F16);
  55         k.EnableInputDataType(Datatype::F32);
  56         k.EnableInputWeightsType(WeightsType::F16);
  57         k.EnableInputWeightsType(WeightsType::F32);
  58         k.EnableOutputDataType(Datatype::F16);
  59         k.EnableOutputDataType(Datatype::F32);
  60         k.EnableInputLayout(DataLayout::bfyx);
  61         k.EnableOutputLayout(DataLayout::bfyx);
  62         k.EnableTensorOffset();
  63         k.EnableTensorPitches();
  64         k.EnableSubGroup();
  65         k.EnableBiasPerFeature();
  66         k.EnableBiasPerOutput();
  67         k.EnableNonBiasTerm();
  68         k.EnableBatching();
  69         k.EnableFusedConvEltwSplitSupport();
  70         k.EnableFusedConvEltwDilation();
  71         k.EnableFusedConvEltwTranspose();
  72         k.EnableFusedConvEltwiseRWOutOpt(); // data for second input are already in output
  73         return k;
  74     }
  75
  76     static std::pair<size_t, size_t> get_bfyx_req_input_block_dims(
  77         size_t output_block_width,
  78         size_t output_block_height,
  79         const uSize& filter_size,
  80         const uSize& stride,
  81         const uSize& dilation,
  82         size_t sg_size = 16,
  83         size_t read_chunk_size = 8,
  84         size_t min_read_size = 16)
  85     {
  86         assert(output_block_width > 0 && output_block_height > 0);
  87         assert(stride.x > 0 && stride.y > 0);
  88         assert(filter_size.x > 0 && filter_size.y > 0);
  89
  90         // Number of elements in X dimension needed from input to compute output block without re-reading input.
  91         size_t input_block_req_width = (output_block_width - 1) * stride.x + (filter_size.x - 1)*dilation.x + 1;
  92         // Number of elements in Y dimension needed from input to compute output block without re-reading input.
  93         size_t input_block_req_height = (output_block_height - 1) * stride.y + (filter_size.y - 1)*dilation.y + 1;
  94
  95         // Required number of elements in X dimension rounded to nearest >= read chunk size.
  96         size_t input_block_read_width = std::max(RoundUp(input_block_req_width, read_chunk_size), min_read_size);
  97         // Number of sub-group-sized vectors of unit type needed to store input block.
  98         size_t input_block_array_size = CeilDiv(input_block_req_height * input_block_read_width, sg_size);
  99
 100         return std::make_pair(input_block_array_size, input_block_read_width);
 101     }
 102
 103     static void shrink_blocks_to_output_size(size_t output_x, size_t output_y, size_t &block_x, size_t &block_y)
 104     {
 105         // how many elements we will compute in each dimension
 106         size_t computed_x = Align(output_x, block_x);
 107         size_t computed_y = Align(output_y, block_y);
 108         // how many simds we need in each dimension
 109         size_t simds_x = computed_x / block_x;
 110         size_t simds_y = computed_y / block_y;
 111         // how many unused values we have in each dimension
 112         size_t unused_x = computed_x - output_x;
 113         size_t unused_y = computed_y - output_y;
 114
 115         block_x -= unused_x / simds_x;
 116         block_y -= unused_y / simds_y;
 117     }
 118
 119     fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::AutoTuneOption fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::GetAutoTuneOptions(const Params& p, int autoTuneIndex) const
 120     {
 121         if ((autoTuneIndex >= 0) && (autoTuneIndex < (int)autoTuneOptions.size()))
 122         {
 123             return autoTuneOptions[autoTuneIndex];
 124         }
 125
 126         AutoTuneOption option = { 0, 0, 0, DEFAULT };
 127
 128         const convolution_params& cp = static_cast<const convolution_params&>(p);
 129
 130         if (cp.stride.x == 1 && cp.stride.y == 1)
 131         {
 132             if (cp.filterSize.x == 1 && cp.filterSize.y == 1)
 133             {
 134                 option.blockWidth = 16;
 135                 option.blockHeight = 1;
 136                 option.prefetch = 4;
 137             }
 138             //if less than 16 values is required to compute one single row of output
 139             //then each WI shall compute one single row to maximize reuse within SIMD subgroup (this gives very nice performance results)
 140             else if (cp.output.X().v + (cp.filterSize.x - 1)*cp.dilation.x < sub_group_size)
 141             {
 142                 option.blockWidth = cp.output.X().v;
 143                 option.blockHeight = 1;
 144                 option.prefetch = 4;
 145             }
 146             else if (cp.filterSize.x < 5 && cp.filterSize.y < 5)
 147             {
 148                 option.blockWidth = sub_group_size - cp.filterSize.x + 1;
 149                 option.blockHeight = 2;
 150                 option.prefetch = 4;
 151             }
 152             else
 153             {
 154                 option.blockWidth = 4;
 155                 option.blockHeight = 3;
 156                 option.prefetch = 4;
 157             }
 158         }
 159         else if (cp.stride.x == 2 && cp.stride.y == 2)
 160         {
 161             option.blockWidth = 5;
 162             option.blockHeight = 4;
 163             option.prefetch = 4;
 164         }
 165         else
 166         {
 167             option.blockWidth = 4;
 168             option.blockHeight = 3;
 169             option.prefetch = 5;
 170             //run_info.effiency = FORCE_PRIORITY_7; // GEMM is better
 171         }
 172
 173         // if this is not 1x1 batch1 case then shrink filters, other way we're memory bound and it's best to use 16x1 block sizes
 174         if (cp.filterSize.x != 1 || cp.filterSize.y != 1 || cp.output.Batch().v != 1)
 175         {
 176             shrink_blocks_to_output_size(cp.output.X().v, cp.output.Y().v,
 177                 option.blockWidth, option.blockHeight);
 178         }
 179
 180         return option;
 181     }
 182
 183     fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::SetDefault(const fused_conv_eltwise_params& cp, int autoTuneIndex) const
 184     {
 185         DispatchData runInfo = fused_conv_eltwise_kernel_base::SetDefault(cp);
 186
 187         const auto of_maps = cp.output.Feature().v;
 188         const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size);
 189
 190         runInfo.effiency = FORCE_PRIORITY_3;
 191
 192         auto tuneOptions = GetAutoTuneOptions(cp, autoTuneIndex);
 193         runInfo.cldnnStyle.blockWidth = tuneOptions.blockWidth;
 194         runInfo.cldnnStyle.blockHeight = tuneOptions.blockHeight;
 195         runInfo.cldnnStyle.prefetch = tuneOptions.prefetch;
 196
 197         auto input_block_dims = get_bfyx_req_input_block_dims(
 198             runInfo.cldnnStyle.blockWidth,
 199             runInfo.cldnnStyle.blockHeight,
 200             cp.conv.filterSize,
 201             cp.conv.stride,
 202             cp.conv.dilation,
 203             sub_group_size,
 204             runInfo.fp16UnitUsed ? sub_group_size : sub_group_size / 2,
 205             sub_group_size);
 206         runInfo.cldnnStyle.inputBlockArraySize = input_block_dims.first;
 207         runInfo.cldnnStyle.inputBlockWidth = input_block_dims.second;
 208
 209         runInfo.gws0 = CeilDiv(cp.output.X().v, runInfo.cldnnStyle.blockWidth);
 210         runInfo.gws1 = CeilDiv(cp.output.Y().v, runInfo.cldnnStyle.blockHeight);
 211         runInfo.gws2 = of_threads_per_batch * cp.output.Batch().v;
 212
 213         runInfo.lws0 = 1;
 214         runInfo.lws1 = 1;
 215         runInfo.lws2 = sub_group_size;
 216
 217         return runInfo;
 218     }
 219
 220     bool fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::Validate(const Params& p, const optional_params& o) const
 221     {
 222         if (!fused_conv_eltwise_kernel_base::Validate(p, o) ||
 223             !FusedConvolutionEltwiseCheckInput(p, o))
 224         {
 225             return false;
 226         }
 227
 228         return true;
 229     }
 230
 231     JitConstants fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& runInfo) const
 232     {
 233         const auto of_maps = params.output.Feature().v;
 234         const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size);
 235         size_t leftovers = of_threads_per_batch - of_maps;
 236
 237         auto jit = Parent::GetJitConstants(params, runInfo);
 238
 239         jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws2));
 240         jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", runInfo.cldnnStyle.blockWidth));
 241         jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_HEIGHT", runInfo.cldnnStyle.blockHeight));
 242         jit.AddConstant(MakeJitConstant("IN_BLOCK_ARRAY_SIZE", runInfo.cldnnStyle.inputBlockArraySize));
 243         jit.AddConstant(MakeJitConstant("IN_BLOCK_WIDTH", runInfo.cldnnStyle.inputBlockWidth));
 244         jit.AddConstant(MakeJitConstant("PREFETCH", runInfo.cldnnStyle.prefetch));
 245
 246         if (leftovers)
 247         {
 248             jit.AddConstant(MakeJitConstant("LEFTOVERS", leftovers));
 249         }
 250
 251         if (!params.eltw.stride.empty())
 252         {
 253             jit.AddConstant(MakeJitConstant("ELTW_STRIDE_X", params.eltw.stride[0].x));
 254             jit.AddConstant(MakeJitConstant("ELTW_STRIDE_Y", params.eltw.stride[0].y));
 255         }
 256         else
 257         {
 258             jit.AddConstant(MakeJitConstant("ELTW_STRIDE_X", 1));
 259             jit.AddConstant(MakeJitConstant("ELTW_STRIDE_Y", 1));
 260         }
 261
 262         return jit;
 263     }
 264
 265     std::vector<WeightsLayout> fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::GetSupportedWeightLayouts(const fused_conv_eltwise_params& params) const
 266     {
 267         if (!params.conv.transposed)
 268         {
 269             return{ WeightsLayout::os_iyx_osv16 };
 270         }
 271         else
 272         {
 273             return{ WeightsLayout::os_iyx_osv16_rotate_180 };
 274         }
 275     }
 276
 277     KernelsData fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::GetKernelsData(const Params& params, const optional_params& options) const
 278     {
 279         return GetTunedKernelsDataByIndex(params, options);
 280     }
 281
 282     KernelsData fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::GetKernelsDataForAutoTune(const Params& params, const optional_params& options) const
 283     {
 284         if (!Validate(params, options))
 285         {
 286             return{};
 287         }
 288
 289         KernelsData res = {};
 290
 291         for (size_t i = 0; i < autoTuneOptions.size(); i++)
 292         {
 293             KernelsData kd = GetTunedKernelsDataByIndex(params, options, (int)i);
 294             if (!kd.empty())
 295             {
 296                 res.emplace_back(kd[0]);
 297             }
 298         }
 299
 300         return res;
 301     }
 302
 303 }