// Copyright (c) 2016-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "convolution_kernel_bfyx_to_bfyx_f16.h"
#include "kernel_selector_utils.h"

#include <algorithm>
#include <cctype>
#include <stdexcept>
#include <string>
#include <vector>

namespace kernel_selector {

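// The kernel is written for SIMD16 sub-groups; output features are processed
// in blocks of 16 to match the blocked bfyx_f16 layout.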
static const size_t sub_group_size = 16;
static const size_t feature_block_size = 16;

ConvolutionKernel_bfyx_to_bfyx_f16::ConvolutionKernel_bfyx_to_bfyx_f16()
    : ConvolutionKernelBase("convolution_gpu_bfyx_to_bfyx_f16") {
    std::vector<size_t> outputBlockWidths = {2, 4, 8};
    std::vector<std::string> executionModes = ConvolutionKernelBase::autoTuneOptions;

    for (auto w : outputBlockWidths) {
        for (auto exeMode : executionModes) {
            autoTuneOptions.emplace_back(AutoTuneOption{w, exeMode});
        }
    }
}

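// Returns the requested tuning variant, falling back to block width 8 with
// AGE_BASED ordering when the index is out of range.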
ConvolutionKernel_bfyx_to_bfyx_f16::AutoTuneOption ConvolutionKernel_bfyx_to_bfyx_f16::GetAutoTuneOptions(
    const Params& /*arg*/,
    int autoTuneIndex) const {
    if (autoTuneIndex >= 0 && autoTuneIndex < static_cast<int>(autoTuneOptions.size()))
        return autoTuneOptions[autoTuneIndex];

    return {8, AGE_BASED};
}

ParamsKey ConvolutionKernel_bfyx_to_bfyx_f16::GetSupportedKey() const {
    ParamsKey k;
    k.EnableInputDataType(Datatype::F16);
    k.EnableInputDataType(Datatype::F32);
    k.EnableOutputDataType(Datatype::F16);
    k.EnableOutputDataType(Datatype::F32);
    k.EnableInputWeightsType(WeightsType::F16);
    k.EnableInputWeightsType(WeightsType::F32);
    k.EnableInputLayout(DataLayout::bfyx);
    k.EnableOutputLayout(DataLayout::bfyx_f16);
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    // TODO Add dilation support to kernel
    // k.EnableDilation();
    k.EnableBiasPerFeature();
    // TODO Add bias per output support to kernel
    // k.EnableBiasPerOutput();
    k.EnableNonBiasTerm();
    k.EnableSplitSupport();
    k.EnableDepthwiseSeparableOpt();
    k.EnableBatching();
    k.EnableSubGroup();
    k.EnableSubGroupShort();
    return k;
}

ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_to_bfyx_f16::SetDefault(const convolution_params& params,
                                                                                   int autoTuneIndex) const {
    DispatchData kd = ConvolutionKernelBase::SetDefault(params);

    const auto& out = params.output;

    auto autoTune = GetAutoTuneOptions(params, autoTuneIndex);
    kd.cldnnStyle.blockWidth = autoTune.blockWidth;

    auto x = out.X().v;
    auto y = out.Y().v;
    auto f = out.Feature().v;
    auto b = out.Batch().v;

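    // Each work-item produces a blockWidth-wide strip of output pixels:
    // gws0 walks the X blocks and Y rows, gws1 the sub-group-aligned output
    // features, gws2 the batch.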
    kd.gws0 = CeilDiv(x, autoTune.blockWidth) * y;
    kd.gws1 = Align(f, sub_group_size);
    kd.gws2 = b;

    kd.lws0 = 1;
    kd.lws1 = sub_group_size;
    kd.lws2 = 1;

    // Assumption: the elided condition selects the priority by batch size.
    if (b == 1)
        kd.effiency = FORCE_PRIORITY_2;
    else
        kd.effiency = FORCE_PRIORITY_7;

    return kd;
}

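// Rejects configurations the generated kernel cannot handle; notably, only
// 3-feature (e.g. RGB) inputs are supported.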
bool ConvolutionKernel_bfyx_to_bfyx_f16::Validate(const Params& p, const optional_params& o) const {
    if (!ConvolutionKernelBase::Validate(p, o)) {
        return false;
    }

    const auto& params = static_cast<const convolution_params&>(p);

    const auto& input = params.inputs[0];
    const auto& output = params.output;

    // TODO Add support for other input feature counts to the kernel
    if (input.Feature().v != 3) {
        return false;
    }

    // Check that padding before features doesn't misalign the blocks
    if (input.Feature().pad.before % feature_block_size != 0 || output.Feature().pad.before % feature_block_size != 0) {
        return false;
    }

    return true;
}

JitConstants ConvolutionKernel_bfyx_to_bfyx_f16::GetJitConstants(const convolution_params& params,
                                                                 const DispatchData& runInfo) const {
    auto input = params.inputs[0];
    auto output = params.output;
    auto jit = Parent::GetJitConstants(params, runInfo);

    auto blockWidth = runInfo.cldnnStyle.blockWidth;

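    // The input line needed for one output block spans blockWidth - 1 stride
    // steps plus the dilated filter extent, clamped to the padded input width;
    // the per-work-item input block is then sized in whole sub-group reads.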
    size_t input_line_size = std::min(params.stride.x * (blockWidth - 1) + (params.weights.X().v - 1) * params.dilation.x + 1,
                                      input.X().v + input.X().pad.Total());
    size_t input_block_size = CeilDiv(input_line_size * params.filterSize.y, sub_group_size);

    jit.AddConstant(MakeJitConstant("OUTPUT_X_BLOCK_SIZE", blockWidth));
    jit.AddConstant(MakeJitConstant("INPUT_LINE_SIZE", input_line_size));
    jit.AddConstant(MakeJitConstant("INPUT_BLOCK_SIZE", input_block_size));
    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
    jit.AddConstant(MakeJitConstant("X_BLOCKS", CeilDiv(output.X().v, blockWidth)));

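    // Feature counts that are not a multiple of the block size need a guarded
    // tail in the generated kernel.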
    if (params.output.Feature().v % feature_block_size != 0) {
        jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", 1));
    }

    return jit;
}

KernelsData ConvolutionKernel_bfyx_to_bfyx_f16::GetTunedKernelsDataByIndex(const Params& params,
                                                                           const optional_params& options,
                                                                           const int autoTuneIndex) const {
    auto tuneOptions = GetAutoTuneOptions(params, autoTuneIndex);
    return GetCommonKernelsData(params, options, tuneOptions.exeMode, autoTuneIndex);
}

KernelsData ConvolutionKernel_bfyx_to_bfyx_f16::GetKernelsData(const Params& params,
                                                               const optional_params& options) const {
    return GetTunedKernelsDataByIndex(params, options);
}

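// Builds the JIT snippets that splice fused eltwise operations (and any
// trailing activations) into the generated OpenCL kernel source.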
JitConstants ConvolutionKernel_bfyx_to_bfyx_f16::GetFusedPrimitivesJitConstants(const convolution_params& params,
                                                                                const DispatchData& kd) const {
    JitConstants jit = {};

    size_t op_id = 0;  // numbers each fused op's JIT constants
    std::string input_decls = "";
    std::string load_decls_vec = "";
    std::string load_decls = "";
    std::string eltwise_fused_ops_vec = "";
    std::string eltwise_fused_ops = "";

    auto make_jit_vector_type = [](std::string tensor_name, size_t vec_size) -> std::string {
        if (vec_size == 0 || vec_size > 8)
            throw std::invalid_argument("Invalid vector size in jit definitions");
        if (vec_size > 1)
            return "MAKE_VECTOR_TYPE(" + tensor_name + "_TYPE," + std::to_string(vec_size) + ")";
        else
            return tensor_name + "_TYPE";
    };

    auto make_jit_load = [](std::string tensor_name, std::string ptr_name, size_t vec_size) -> std::string {
        if (vec_size == 0 || vec_size > 8)
            throw std::invalid_argument("Invalid vector size in jit definitions");

        std::string index_func_call_vec = tensor_name + "_GET_INDEX(b, f_block*16, y, x)";
        std::string index_func_call = tensor_name + "_GET_INDEX(b, f_block*16, y, x+i)";
        if (vec_size > 1)
            return " UNIT_BLOCK_READ" + std::to_string(vec_size) + "(" + ptr_name + ", " + index_func_call_vec + ")";
        else
            return " UNIT_BLOCK_READ(" + ptr_name + ", " + index_func_call + ")";
    };

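    // For example, make_jit_load("FUSED_OP_0_INPUT0", "eltwise_input0", 8)
    // produces:
    //   UNIT_BLOCK_READ8(eltwise_input0, FUSED_OP_0_INPUT0_GET_INDEX(b, f_block*16, y, x))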
    for (auto& fused_dep : params.fused_ops) {
        std::string op_type = "";
        switch (fused_dep.type) {
            case convolution_params::fused_operation_desc::Type::ELTWISE: {
                op_type = "eltwise";
                eltwise_fused_ops_vec += "dst = (dst + " + op_type + "_data);";
                eltwise_fused_ops += "dst[i] = (dst[i] + " + op_type + "_data);";
                break;
            }
            default:
                throw std::invalid_argument("Invalid fused op in convolution kernel: " + params.layerID);
        }

        for (size_t op_input_id = 0; op_input_id < fused_dep.tensors.size(); op_input_id++) {
            std::string name = "FUSED_OP_" + std::to_string(op_id) + "_INPUT" + std::to_string(op_input_id);
            std::string ptr_name = op_type + "_input" + std::to_string(op_input_id);

            std::string var_name = op_type + "_data";
            jit.AddConstant(MakeJitConstant(name, fused_dep.tensors[op_input_id]));
            input_decls += "const __global " + toCLType(fused_dep.tensors[op_input_id].GetDType()) +
                           "* " + ptr_name + ",";
            load_decls_vec += make_jit_vector_type(name, kd.cldnnStyle.blockWidth) + " " + var_name + " = " +
                              make_jit_load(name, ptr_name, kd.cldnnStyle.blockWidth) + ";";
            load_decls += make_jit_vector_type(name, 1) + " " + var_name + " = " +
                          make_jit_load(name, ptr_name, 1) + ";";
        }

        if (fused_dep.activation.function != ActivationFunction::NONE) {
            std::string temp_op_type = op_type;
            for (auto& ch : temp_op_type)
                ch = static_cast<char>(std::toupper(static_cast<unsigned char>(ch)));
            std::string suffix = "_" + temp_op_type;

            jit.Merge(MakeActivationJitConstants(fused_dep.activation, suffix));
            eltwise_fused_ops_vec += "dst = ACTIVATION" + suffix + "(dst, ACTIVATION_PARAMS" + suffix + ");";
            eltwise_fused_ops += "dst[i] = ACTIVATION" + suffix + "(dst[i], ACTIVATION_PARAMS" + suffix + ");";
        }

        op_id++;
    }

    jit.AddConstant(MakeJitConstant("FUSED_OPS_DECLS", input_decls));
    jit.AddConstant(MakeJitConstant("FUSED_OPS_LOAD_DATA_VEC", load_decls_vec));
    jit.AddConstant(MakeJitConstant("FUSED_OPS_LOAD_DATA", load_decls));
    jit.AddConstant(MakeJitConstant("DO_ELTWISE_FUSED_OPS_VEC", eltwise_fused_ops_vec));
    jit.AddConstant(MakeJitConstant("DO_ELTWISE_FUSED_OPS", eltwise_fused_ops));

    return jit;
}

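// Emits one tuned kernel per auto-tune option so the auto-tuner can benchmark
// each variant.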
KernelsData ConvolutionKernel_bfyx_to_bfyx_f16::GetKernelsDataForAutoTune(const Params& params,
                                                                          const optional_params& options) const {
    if (!Validate(params, options)) {
        return {};
    }

    KernelsData res = {};

    for (size_t i = 0; i < autoTuneOptions.size(); i++) {
        KernelsData kd = GetTunedKernelsDataByIndex(params, options, static_cast<int>(i));
        if (!kd.empty()) {
            res.emplace_back(kd[0]);
        }
    }

    return res;
}

}  // namespace kernel_selector