// Copyright (c) 2016-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "convolution_kernel_bfyx_to_bfyx_f16.h"
#include "kernel_selector_utils.h"

#include <algorithm>
#include <cctype>
#include <stdexcept>
#include <string>
#include <vector>

namespace kernel_selector {

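// The kernel is written for SIMD16 sub-groups; output features are processed
// in blocks of 16 to match the blocked bfyx_f16 layout.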
static const size_t sub_group_size = 16;
static const size_t feature_block_size = 16;

ConvolutionKernel_bfyx_to_bfyx_f16::ConvolutionKernel_bfyx_to_bfyx_f16()
    : ConvolutionKernelBase("convolution_gpu_bfyx_to_bfyx_f16") {
    std::vector<size_t> outputBlockWidths = {2, 4, 8};
    std::vector<std::string> executionModes = ConvolutionKernelBase::autoTuneOptions;

    for (auto w : outputBlockWidths) {
        for (auto exeMode : executionModes) {
            autoTuneOptions.emplace_back(AutoTuneOption{w, exeMode});
        }
    }
}

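// Returns the requested tuning variant, falling back to block width 8 with
// AGE_BASED ordering when the index is out of range.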
ConvolutionKernel_bfyx_to_bfyx_f16::AutoTuneOption ConvolutionKernel_bfyx_to_bfyx_f16::GetAutoTuneOptions(
    const Params& /*arg*/,
    int autoTuneIndex) const {
    if (autoTuneIndex >= 0 && autoTuneIndex < static_cast<int>(autoTuneOptions.size()))
        return autoTuneOptions[autoTuneIndex];

    return {8, AGE_BASED};
}

ParamsKey ConvolutionKernel_bfyx_to_bfyx_f16::GetSupportedKey() const {
    ParamsKey k;
    k.EnableInputDataType(Datatype::F16);
    k.EnableInputDataType(Datatype::F32);
    k.EnableOutputDataType(Datatype::F16);
    k.EnableOutputDataType(Datatype::F32);
    k.EnableInputWeightsType(WeightsType::F16);
    k.EnableInputWeightsType(WeightsType::F32);
    k.EnableInputLayout(DataLayout::bfyx);
    k.EnableOutputLayout(DataLayout::bfyx_f16);
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    // TODO Add dilation support to kernel
    // k.EnableDilation();
    k.EnableBiasPerFeature();
    // TODO Add bias per output support to kernel
    // k.EnableBiasPerOutput();
    k.EnableNonBiasTerm();
    k.EnableSplitSupport();
    k.EnableDepthwiseSeparableOpt();
    k.EnableBatching();
    k.EnableSubGroup();
    k.EnableSubGroupShort();
    return k;
}

ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_to_bfyx_f16::SetDefault(const convolution_params& params,
                                                                                   int autoTuneIndex) const {
    DispatchData kd = ConvolutionKernelBase::SetDefault(params);

    const auto& out = params.output;

    auto autoTune = GetAutoTuneOptions(params, autoTuneIndex);
    kd.cldnnStyle.blockWidth = autoTune.blockWidth;

    auto x = out.X().v;
    auto y = out.Y().v;
    auto f = out.Feature().v;
    auto b = out.Batch().v;

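    // Each work-item produces a blockWidth-wide strip of output pixels:
    // gws0 walks the X blocks and Y rows, gws1 the sub-group-aligned output
    // features, gws2 the batch.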
    kd.gws0 = CeilDiv(x, autoTune.blockWidth) * y;
    kd.gws1 = Align(f, sub_group_size);
    kd.gws2 = b;

    kd.lws0 = 1;
    kd.lws1 = sub_group_size;
    kd.lws2 = 1;

    // Assumption: the elided condition selects the priority by batch size.
    if (b == 1)
        kd.effiency = FORCE_PRIORITY_2;
    else
        kd.effiency = FORCE_PRIORITY_7;

    return kd;
}

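// Rejects configurations the generated kernel cannot handle; notably, only
// 3-feature (e.g. RGB) inputs are supported.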
bool ConvolutionKernel_bfyx_to_bfyx_f16::Validate(const Params& p, const optional_params& o) const {
    if (!ConvolutionKernelBase::Validate(p, o)) {
        return false;
    }

    const auto& params = static_cast<const convolution_params&>(p);

    const auto& input = params.inputs[0];
    const auto& output = params.output;

    // TODO Add support for other input feature counts to the kernel
    if (input.Feature().v != 3) {
        return false;
    }

    // Check that padding before features doesn't misalign the blocks
    if (input.Feature().pad.before % feature_block_size != 0 || output.Feature().pad.before % feature_block_size != 0) {
        return false;
    }

    return true;
}

JitConstants ConvolutionKernel_bfyx_to_bfyx_f16::GetJitConstants(const convolution_params& params,
                                                                 const DispatchData& runInfo) const {
    auto input = params.inputs[0];
    auto output = params.output;
    auto jit = Parent::GetJitConstants(params, runInfo);

    auto blockWidth = runInfo.cldnnStyle.blockWidth;

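    // The input line needed for one output block spans blockWidth - 1 stride
    // steps plus the dilated filter extent, clamped to the padded input width;
    // the per-work-item input block is then sized in whole sub-group reads.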
    size_t input_line_size = std::min(params.stride.x * (blockWidth - 1) + (params.weights.X().v - 1) * params.dilation.x + 1,
                                      input.X().v + input.X().pad.Total());
    size_t input_block_size = CeilDiv(input_line_size * params.filterSize.y, sub_group_size);

    jit.AddConstant(MakeJitConstant("OUTPUT_X_BLOCK_SIZE", blockWidth));
    jit.AddConstant(MakeJitConstant("INPUT_LINE_SIZE", input_line_size));
    jit.AddConstant(MakeJitConstant("INPUT_BLOCK_SIZE", input_block_size));
    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
    jit.AddConstant(MakeJitConstant("X_BLOCKS", CeilDiv(output.X().v, blockWidth)));

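    // Feature counts that are not a multiple of the block size need a guarded
    // tail in the generated kernel.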
    if (params.output.Feature().v % feature_block_size != 0) {
        jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", 1));
    }

    return jit;
}

KernelsData ConvolutionKernel_bfyx_to_bfyx_f16::GetTunedKernelsDataByIndex(const Params& params,
                                                                           const optional_params& options,
                                                                           const int autoTuneIndex) const {
    auto tuneOptions = GetAutoTuneOptions(params, autoTuneIndex);
    return GetCommonKernelsData(params, options, tuneOptions.exeMode, autoTuneIndex);
}

KernelsData ConvolutionKernel_bfyx_to_bfyx_f16::GetKernelsData(const Params& params,
                                                               const optional_params& options) const {
    return GetTunedKernelsDataByIndex(params, options);
}

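// Builds the JIT snippets that splice fused eltwise operations (and any
// trailing activations) into the generated OpenCL kernel source.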
JitConstants ConvolutionKernel_bfyx_to_bfyx_f16::GetFusedPrimitivesJitConstants(const convolution_params& params,
                                                                                const DispatchData& kd) const {
    JitConstants jit = {};

    size_t op_id = 0;  // numbers each fused op's JIT constants
    std::string input_decls = "";
    std::string load_decls_vec = "";
    std::string load_decls = "";
    std::string eltwise_fused_ops_vec = "";
    std::string eltwise_fused_ops = "";

    auto make_jit_vector_type = [](std::string tensor_name, size_t vec_size) -> std::string {
        if (vec_size == 0 || vec_size > 8)
            throw std::invalid_argument("Invalid vector size in jit definitions");
        if (vec_size > 1)
            return "MAKE_VECTOR_TYPE(" + tensor_name + "_TYPE," + std::to_string(vec_size) + ")";
        else
            return tensor_name + "_TYPE";
    };

    auto make_jit_load = [](std::string tensor_name, std::string ptr_name, size_t vec_size) -> std::string {
        if (vec_size == 0 || vec_size > 8)
            throw std::invalid_argument("Invalid vector size in jit definitions");

        std::string index_func_call_vec = tensor_name + "_GET_INDEX(b, f_block*16, y, x)";
        std::string index_func_call = tensor_name + "_GET_INDEX(b, f_block*16, y, x+i)";
        if (vec_size > 1)
            return " UNIT_BLOCK_READ" + std::to_string(vec_size) + "(" + ptr_name + ", " + index_func_call_vec + ")";
        else
            return " UNIT_BLOCK_READ(" + ptr_name + ", " + index_func_call + ")";
    };

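    // For example, make_jit_load("FUSED_OP_0_INPUT0", "eltwise_input0", 8)
    // produces:
    //   UNIT_BLOCK_READ8(eltwise_input0, FUSED_OP_0_INPUT0_GET_INDEX(b, f_block*16, y, x))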
    for (auto& fused_dep : params.fused_ops) {
        std::string op_type = "";
        switch (fused_dep.type) {
            case convolution_params::fused_operation_desc::Type::ELTWISE: {
                op_type = "eltwise";
                eltwise_fused_ops_vec += "dst = (dst + " + op_type + "_data);";
                eltwise_fused_ops += "dst[i] = (dst[i] + " + op_type + "_data);";
                break;
            }
            default:
                throw std::invalid_argument("Invalid fused op in convolution kernel: " + params.layerID);
        }

        for (size_t op_input_id = 0; op_input_id < fused_dep.tensors.size(); op_input_id++) {
            std::string name = "FUSED_OP_" + std::to_string(op_id) + "_INPUT" + std::to_string(op_input_id);
            std::string ptr_name = op_type + "_input" + std::to_string(op_input_id);

            std::string var_name = op_type + "_data";
            jit.AddConstant(MakeJitConstant(name, fused_dep.tensors[op_input_id]));
            input_decls += "const __global " + toCLType(fused_dep.tensors[op_input_id].GetDType()) +
                           "* " + ptr_name + ",";
            load_decls_vec += make_jit_vector_type(name, kd.cldnnStyle.blockWidth) + " " + var_name + " = " +
                              make_jit_load(name, ptr_name, kd.cldnnStyle.blockWidth) + ";";
            load_decls += make_jit_vector_type(name, 1) + " " + var_name + " = " +
                          make_jit_load(name, ptr_name, 1) + ";";
        }

        if (fused_dep.activation.function != ActivationFunction::NONE) {
            std::string temp_op_type = op_type;
            for (auto& ch : temp_op_type)
                ch = static_cast<char>(std::toupper(static_cast<unsigned char>(ch)));
            std::string suffix = "_" + temp_op_type;

            jit.Merge(MakeActivationJitConstants(fused_dep.activation, suffix));
            eltwise_fused_ops_vec += "dst = ACTIVATION" + suffix + "(dst, ACTIVATION_PARAMS" + suffix + ");";
            eltwise_fused_ops += "dst[i] = ACTIVATION" + suffix + "(dst[i], ACTIVATION_PARAMS" + suffix + ");";
        }

        op_id++;
    }

    jit.AddConstant(MakeJitConstant("FUSED_OPS_DECLS", input_decls));
    jit.AddConstant(MakeJitConstant("FUSED_OPS_LOAD_DATA_VEC", load_decls_vec));
    jit.AddConstant(MakeJitConstant("FUSED_OPS_LOAD_DATA", load_decls));
    jit.AddConstant(MakeJitConstant("DO_ELTWISE_FUSED_OPS_VEC", eltwise_fused_ops_vec));
    jit.AddConstant(MakeJitConstant("DO_ELTWISE_FUSED_OPS", eltwise_fused_ops));

    return jit;
}

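// Emits one tuned kernel per auto-tune option so the auto-tuner can benchmark
// each variant.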
KernelsData ConvolutionKernel_bfyx_to_bfyx_f16::GetKernelsDataForAutoTune(const Params& params,
                                                                          const optional_params& options) const {
    if (!Validate(params, options)) {
        return {};
    }

    KernelsData res = {};

    for (size_t i = 0; i < autoTuneOptions.size(); i++) {
        KernelsData kd = GetTunedKernelsDataByIndex(params, options, static_cast<int>(i));
        if (!kd.empty()) {
            res.emplace_back(kd[0]);
        }
    }

    return res;
}

}  // namespace kernel_selector