2 // Copyright (c) 2016 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
17 #include "fused_conv_eltwise_inst.h"
18 #include "primitive_gpu_base.h"
19 #include "implementation_map.h"
20 #include "error_handler.h"
21 #include "kernel_selector_helper.h"
22 #include "kernel_runner.h"
23 #include "fused_conv_eltwise/fused_conv_eltwise_kernel_selector.h"
24 #include "fused_conv_eltwise/fused_conv_eltwise_kernel_base.h"
namespace cldnn { namespace gpu {

// GPU implementation wrapper for the fused convolution + eltwise primitive.
// Translates a cldnn fused_conv_eltwise node into kernel_selector kernels.
struct fused_conv_eltwise_gpu : typed_primitive_gpu_impl<fused_conv_eltwise>
    using parent = typed_primitive_gpu_impl<fused_conv_eltwise>;

    // Validates that the input, output and filter memories of the instance
    // all use the same data type, reporting mismatches through the
    // CLDNN_ERROR_DATA_TYPES_MISMATCH macro.
    // NOTE(review): the enclosing braces and the trailing return statement
    // are not visible in this extract — the body appears truncated here.
    virtual bool validate_impl(const typed_primitive_inst<fused_conv_eltwise>& instance) const override
        auto outer_id = _outer.id();
        auto data_type = instance.node.input().get_output_layout().data_type;

        // Check whether all memory elements use the same unit type (FP16 or FP32).
        CLDNN_ERROR_DATA_TYPES_MISMATCH(outer_id, "Input memory", data_type, "output memory", instance.node.get_output_layout().data_type, "");
        CLDNN_ERROR_DATA_TYPES_MISMATCH(outer_id, "Input memory", data_type, "filter memory", instance.weights_memory(0).get_layout().data_type, "");
// Collects the kernel arguments for the given split index: starts from the
// arguments gathered by the base implementation and adds the per-split
// weights, optional bias, and the optional int8 quantization/calibration
// factor buffers.
virtual kernel::kernel_arguments_data get_arguments(typed_primitive_inst<fused_conv_eltwise>& instance, int32_t split) const override
    kernel::kernel_arguments_data args = parent::get_arguments(instance, split);

    // Weights are always present; bias and the quantization/calibration
    // buffers are only attached when the corresponding term exists.
    args.weights = &instance.weights_memory(split);
    args.bias = instance.bias_term() ? &instance.bias_memory(split) : nullptr;
    args.weights_quantization_factors = instance.weights_quantization_factors_term() ? &instance.weights_quantization_factors_memory(split) : nullptr;
    args.output_calibration_factors = instance.conv_output_calibration_factors_term() ? &instance.output_calibration_factors_memory(split) : nullptr;
    // Calibration factors for the fused eltwise stage (note: not per-split).
    if (instance.eltw_output_calibration_factors_term())
        args.fused_op_calibration_factors.push_back(&instance.eltw_output_calibration_factors_memory());
    // NOTE(review): the trailing 'return args;' is not visible in this extract.
// Returns the number of convolution splits, forwarded from the node's
// primitive descriptor.
virtual int32_t get_split() const override
    return _outer.get_split();
// Factory method: converts the fused_conv_eltwise node into
// kernel_selector parameter structures, asks the kernel selector for the
// best matching kernel, and wraps it into a fused_conv_eltwise_gpu impl.
// NOTE(review): several structural lines (braces, the trailing
// 'return conv;') are missing from this extract; indentation below is a
// best-effort reconstruction of the control flow — verify against the
// full file.
static primitive_impl* create(const fused_conv_eltwise_node &arg)
    const auto& primitive = arg.get_primitive();
    const auto& input_layout = arg.input().get_output_layout();
    const auto& weights_layout = arg.weights(0).get_output_layout();
    const auto& weights_size = weights_layout.size;

    // Convolution hyper-parameters from the primitive descriptor.
    const auto& split = primitive->split();
    const auto& stride = primitive->conv.stride;
    const auto& dilation = primitive->conv.dilation;
    const auto& input_offset = primitive->conv.input_offset;

    // With the depthwise-separable optimization enabled the kernel handles
    // all groups in one pass, so the effective split becomes 1.
    const auto depthwise_separable_opt = arg.get_depthwise_sep_opt();
    const auto actual_split = depthwise_separable_opt ? (decltype(split))1 : split;

    const auto transposed = arg.get_transposed();

    // Sanity check: output features per split must match the filter batch.
    assert(arg.get_output_layout().size.feature[0] / primitive->split() == weights_layout.size.batch[0]);

    auto fused_params = get_weights_bias_default_params<kernel_selector::fused_conv_eltwise_params>(arg, actual_split);
    // add second input for eltwise
    if (!static_cast<const fused_conv_eltwise*>(arg.get_primitive().get())->second_input_in_output)
        fused_params.inputs.push_back(convert_data_tensor(arg.input(1).get_output_layout()));

    auto& conv_params = fused_params.conv;
    auto& eltw_params = fused_params.eltw;

    auto conv_optional_params = get_default_weights_bias_optional_params<kernel_selector::fused_conv_eltwise_optional_params>(arg.get_program());

    // A positive input offset shifts the view of the input tensor instead
    // of being expressed as padding (negative offsets become padding below).
    const auto additional_offset = tensor::max(input_offset, 0);
    if (additional_offset != 0)
        fused_params.inputs[0] = convert_data_tensor(input_layout, actual_split, additional_offset);

    // Fused activation functions for the convolution and eltwise stages.
    if (primitive->conv.with_activation)
        convert_activation_func_params(&primitive->conv, fused_params.activation);

    if (primitive->eltw.with_activation)
        convert_activation_func_params(&primitive->eltw, fused_params.eltw.activation);

    fused_params.conv.depthwise_separable_opt = depthwise_separable_opt;
    fused_params.conv.transposed = transposed;

    fused_params.second_input_in_output = primitive->second_input_in_output;

    // Convolution geometry: filter size, padding, stride, dilation (2D).
    conv_params.local_convolution = weights_size.local[0] > 1 || weights_size.local[1] > 1;
    conv_params.split = split;
    conv_params.filterSize = {
        (uint32_t)weights_size.spatial[0],
        (uint32_t)weights_size.spatial[1],

    // Only negative input offsets contribute to padding.
    conv_params.padding = {
        (uint32_t)std::max(-input_offset.spatial[0], 0),
        (uint32_t)std::max(-input_offset.spatial[1], 0)

    conv_params.stride = {
        (uint32_t)stride.spatial[0],
        (uint32_t)stride.spatial[1]

    conv_params.dilation = {
        (uint32_t)dilation.spatial[0],
        (uint32_t)dilation.spatial[1]

    // int8 quantization / calibration for the convolution stage — only
    // active when weight quantization factors are attached to the node.
    if (primitive->conv.weights_quantization_factors.size() > 0)
        conv_params.int8_quantization = true;
        conv_params.weights_quantization_factors.push_back(convert_data_tensor(arg.weights_quantization_factors().get_output_layout()).FlattenFeatureAndSpatials());
        conv_params.input_quantization_factor = arg.get_conv_input_qf();

        // Per-channel output calibration takes precedence; presumably the
        // scalar quantization factor is the fallback — TODO confirm the
        // exact branch structure (closing braces missing in this extract).
        if (primitive->conv.output_calibration_factors.size() > 0)
            conv_params.output_calibration = true;
            conv_params.output_calibration_factors.push_back(convert_data_tensor(arg.conv_output_calibration_factors().get_output_layout()).FlattenFeatureAndSpatials());

        conv_params.output_quantization_factor = arg.get_conv_output_qf();

    // int8 quantization / calibration for the fused eltwise stage.
    if (primitive->eltw.output_calibration_factors.size() > 0 || primitive->eltw.output_quantization_factor != 1.0f)
        eltw_params.int8_quantization = true;

        if (primitive->eltw.output_calibration_factors.size() > 0)
            eltw_params.output_calibration = true;
            eltw_params.output_calibration_factors.push_back(convert_data_tensor(arg.eltw_output_calibration_factors().get_output_layout()).FlattenFeatureAndSpatials());

        eltw_params.output_quantization_factor = arg.get_eltw_output_qf();

    // Optional per-input spatial strides for the eltwise operand.
    if (!primitive->eltw.stride.empty())
        const auto& eltw_stride = primitive->eltw.stride;
        eltw_params.stride.resize(eltw_stride.size());
        for (size_t i = 0; i < primitive->eltw.stride.size(); i++)
            eltw_params.stride[i] = { (uint32_t)eltw_stride[i].spatial[0], (uint32_t)eltw_stride[i].spatial[1] };

    auto& kernel_selector = kernel_selector::fused_conv_eltwise_kernel_selector::Instance();

    // When tune-and-cache mode is requested, hand the selector a kernel
    // runner so it can benchmark candidate kernels on the target engine.
    const auto& tuning_config = arg.get_program().get_options().get<build_option_type::tuning_config>();

    if (tuning_config->config.mode == tuning_mode::tuning_tune_and_cache)
        conv_optional_params.tuningParams.runner = std::make_shared<gpu::kernel_runner>(arg.get_program().get_engine(), true);

    kernel_selector::KernelsData best_kernels = kernel_selector.GetBestKernels(fused_params, conv_optional_params);

    // Fail loudly if no kernel supports this parameter combination.
    CLDNN_ERROR_BOOL(arg.id(), "Best_kernel.empty()", best_kernels.empty(), "Cannot find a proper kernel with this arguments");

    // Ownership of 'conv' is transferred to the caller (implementation_map).
    auto conv = new fused_conv_eltwise_gpu(arg, best_kernels[0]);
    // NOTE(review): the trailing 'return conv;' is not visible in this extract.
// Register the create() factory for every data type / memory format
// combination this GPU implementation supports.
implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), fused_conv_eltwise_gpu::create);
implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::yxfb), fused_conv_eltwise_gpu::create);
implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), fused_conv_eltwise_gpu::create);
// int8 path with the block-interleaved fs_bs_yx_bsv4_fsv32 format.
implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32), fused_conv_eltwise_gpu::create);