2 // Copyright (c) 2019 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
17 #include "eltwise_inst.h"
18 #include "primitive_gpu_base.h"
19 #include "implementation_map.h"
20 #include "error_handler.h"
21 #include "kernel_selector_helper.h"
22 #include "eltwise/eltwise_kernel_selector.h"
23 #include "eltwise/eltwise_kernel_base.h"
30 inline kernel_selector::eltwise_mode convert_to_eltwise_mode(eltwise_mode mode) {
32 case eltwise_mode::sum:
33 return kernel_selector::eltwise_mode::ADD;
34 case eltwise_mode::sub:
35 return kernel_selector::eltwise_mode::SUB;
36 case eltwise_mode::max:
37 return kernel_selector::eltwise_mode::MAX;
38 case eltwise_mode::prod:
39 return kernel_selector::eltwise_mode::MUL;
40 case eltwise_mode::div:
41 return kernel_selector::eltwise_mode::DIV;
42 case eltwise_mode::min:
43 return kernel_selector::eltwise_mode::MIN;
44 case eltwise_mode::pow:
45 return kernel_selector::eltwise_mode::POW;
46 case eltwise_mode::mod:
47 return kernel_selector::eltwise_mode::MODULU;
48 case eltwise_mode::eq:
49 return kernel_selector::eltwise_mode::EQ;
50 case eltwise_mode::ne:
51 return kernel_selector::eltwise_mode::NE;
52 case eltwise_mode::lt:
53 return kernel_selector::eltwise_mode::LT;
54 case eltwise_mode::le:
55 return kernel_selector::eltwise_mode::LE;
56 case eltwise_mode::gt:
57 return kernel_selector::eltwise_mode::GT;
58 case eltwise_mode::ge:
59 return kernel_selector::eltwise_mode::GE;
60 case eltwise_mode::logic_and:
61 return kernel_selector::eltwise_mode::LOGIC_AND;
62 case eltwise_mode::logic_or:
63 return kernel_selector::eltwise_mode::LOGIC_OR;
64 case eltwise_mode::logic_xor:
65 return kernel_selector::eltwise_mode::LOGIC_XOR;
66 case eltwise_mode::squared_diff:
67 return kernel_selector::eltwise_mode::SQUARED_DIFF;
68 case eltwise_mode::floor_mod:
69 return kernel_selector::eltwise_mode::FLOOR_MOD;
71 return kernel_selector::eltwise_mode::ADD;
// eltwise_gpu: implementation of the eltwise primitive built on top of
// typed_primitive_gpu_impl<eltwise>. It customises the kernel arguments
// (get_arguments) and provides the factory (create) that turns an
// eltwise_node into a ready-to-use implementation object.
76 struct eltwise_gpu : typed_primitive_gpu_impl<eltwise> {
77 using parent = typed_primitive_gpu_impl<eltwise>;
// Builds the kernel argument set for one primitive instance.
//
// instance: the eltwise instance; queried for its output calibration memory.
// split:    forwarded unchanged to parent::get_arguments.
//
// Starts from the arguments produced by the parent implementation and sets
// output_calibration_factors to the address of the instance's output
// calibration memory when output_calibration_factors_term() is true, and to
// nullptr otherwise.
81 kernel::kernel_arguments_data get_arguments(typed_primitive_inst<eltwise>& instance,
82 int32_t split) const override {
83 kernel::kernel_arguments_data args = parent::get_arguments(instance, split);
85 args.output_calibration_factors =
86 (memory_impl::cptr) (instance.output_calibration_factors_term() ? &instance.output_calibration_factors_memory() : nullptr);
87 // TODO Inputs calibration factors - skipping for now as currently they should never be used in eltwise, create
// Factory for the eltwise implementation.
//
// arg: the program node describing the eltwise primitive (inputs, output
//      layout, calibration/quantization terms).
//
// Fills a kernel_selector::eltwise_params from the node, asks
// eltwise_kernel_selector for the best kernels and constructs an eltwise_gpu
// from the first one. CLDNN_ERROR_BOOL is invoked for three conditions: an
// inputs calibration term, an inputs quantization term, and an empty kernel
// list.
// NOTE(review): presumably CLDNN_ERROR_BOOL raises when its condition is
// true - confirm against error_handler.h.
94 static primitive_impl* create(const eltwise_node& arg) {
95 auto ew_params = get_default_params<kernel_selector::eltwise_params>(arg);
96 auto ew_optional_params =
97 get_default_optional_params<kernel_selector::eltwise_optional_params>(arg.get_program());
// Append the layouts of inputs 1..N-1.
// NOTE(review): the loop starts at 1, so input 0 is presumably already
// present in the default params - TODO confirm.
99 for (size_t i = 1; i < arg.inputs_count(); i++) {
100 ew_params.inputs.push_back(convert_data_tensor(arg.input(i).get_output_layout()));
103 const auto& primitive = arg.get_primitive();
// First operation: Buffer(0) combined with Buffer(1) using the primitive's mode.
105 ew_params.operations.push_back({{kernel_selector::eltwise_params::InputType::Buffer(0),
106 kernel_selector::eltwise_params::InputType::Buffer(1)},
107 convert_to_eltwise_mode(primitive->mode)});
// Every further input i is combined with Intermediate(i - 2), again with the
// same mode (presumably the result of the preceding operation - verify
// against the kernel selector).
109 for (uint32_t i = 2; i < static_cast<uint32_t>(arg.inputs_count()); i++) {
110 ew_params.operations.push_back({{kernel_selector::eltwise_params::InputType::Intermediate(i - 2),
111 kernel_selector::eltwise_params::InputType::Buffer(i)},
112 convert_to_eltwise_mode(primitive->mode)});
// Coefficients are copied only for sum mode.
115 if (primitive->mode == eltwise_mode::sum) {
116 ew_params.coefficients = primitive->coefficients;
// Examine inputs whose dims differ from the output (SameDims is false).
// For such an input the raw size vectors of input and output are compared
// per dimension, testing for "output is not 1 while input is 1".
// ew_params.broadcast and ew_params.layoutBased are both assigned in this
// region.
// NOTE(review): confirm which outcome of the test selects which flag and
// where the loops exit.
119 for (size_t i = 0; i < ew_params.inputs.size(); i++) {
120 if (!ew_params.inputs[i].SameDims(ew_params.output)) {
121 std::vector<int32_t> input_size = arg.input(i).get_output_layout().size.raw.vector();
122 std::vector<int32_t> output_size = arg.get_output_layout().size.raw.vector();
123 bool broadcast = false;
124 for (size_t d = 0; d < output_size.size(); d++) {
125 if (output_size[d] != 1 && input_size[d] == 1)
129 ew_params.broadcast = true;
132 ew_params.layoutBased = true;
// Copy the primitive's strides: one entry per primitive stride, built from
// spatial[0], spatial[1] and spatial[2] cast to uint32_t.
139 if (!primitive->stride.empty()) {
140 const auto& stride = primitive->stride;
141 ew_params.stride.resize(stride.size());
142 for (size_t i = 0; i < primitive->stride.size(); i++) {
143 ew_params.stride[i] = {(uint32_t)stride[i].spatial[0],
144 (uint32_t)stride[i].spatial[1],
145 (uint32_t)stride[i].spatial[2]};
149 // check if strides are the same
// Each stride is compared with the first one on x and y only; a mismatch
// sets layoutBased.
150 if (!ew_params.stride.empty()) {
151 const auto& stride = ew_params.stride[0];
152 for (size_t i = 1; i < ew_params.stride.size(); i++) {
153 if (stride.x != ew_params.stride[i].x || stride.y != ew_params.stride[i].y)
154 ew_params.layoutBased = true;
// Without strides, broadcast is set when inputs 0 and 1 differ according to
// SameDimsSizes.
156 } else if (!ew_params.inputs[0].SameDimsSizes(ew_params.inputs[1])) {
157 ew_params.broadcast = true;
// Output calibration factors, or an output quantization factor other than
// 1.0f, switch on int8 quantization.
160 if (primitive->output_calibration_factors.size() > 0 || primitive->output_quantization_factor != 1.0f) {
161 ew_params.int8_quantization = true;
// With calibration factors present, their layout is converted, flattened via
// FlattenFeatureAndSpatials() and stored.
163 if (primitive->output_calibration_factors.size() > 0) {
164 ew_params.output_calibration = true;
165 ew_params.output_calibration_factors.push_back(
166 convert_data_tensor(arg.output_calibration_factors().get_output_layout())
167 .FlattenFeatureAndSpatials());
// The scalar output quantization factor is read from the node.
// NOTE(review): presumably the alternative to the calibration-factor branch
// above - confirm.
169 ew_params.output_quantization_factor = arg.get_output_qf();
// Inputs calibration is reported as unsupported for eltwise.
173 CLDNN_ERROR_BOOL(arg.id(),
174 "Eltwise inputs calibration term",
175 arg.inputs_calibration_term(),
176 "Eltwise does not yet support inputs calibration, it should be fused with convolution");
// NOTE(review): this tests the same condition that was just passed to
// CLDNN_ERROR_BOOL; if that macro aborts on a true condition, this block can
// never run - confirm.
178 if (arg.inputs_calibration_term()) {
179 ew_params.int8_quantization = true;
180 ew_params.inputs_calibration = true;
// One flattened calibration-factor tensor per entry of
// primitive->inputs_calibration_factors.
182 for (size_t i = 0; i < primitive->inputs_calibration_factors.size(); ++i) {
183 auto icf_layout = arg.input_calibration_factors(i).get_output_layout();
184 ew_params.inputs_calibration_factors.push_back(
185 convert_data_tensor(icf_layout).FlattenFeatureAndSpatials());
// Inputs quantization is likewise reported as unsupported for eltwise.
189 CLDNN_ERROR_BOOL(arg.id(),
190 "Eltwise inputs quantization term",
191 arg.inputs_quantization_term(),
192 "Eltwise does not yet support inputs quantization, it should be fused with convolution");
// NOTE(review): same reachability question as for the calibration block above.
194 if (arg.inputs_quantization_term()) {
195 ew_params.int8_quantization = true;
// Copy every input quantization factor from the primitive.
197 for (const auto& iqf : primitive->input_quantization_factors) {
198 ew_params.input_quantization_factors.push_back(iqf);
// Ask the eltwise kernel selector for candidate kernels for these params.
202 auto& kernel_selector = kernel_selector::eltwise_kernel_selector::Instance();
203 auto best_kernels = kernel_selector.GetBestKernels(ew_params, ew_optional_params);
// An empty candidate list is reported as an error.
205 CLDNN_ERROR_BOOL(arg.id(),
206 "Best_kernel.empty()",
207 best_kernels.empty(),
208 "Cannot find a proper kernel with this arguments");
// The implementation object is built from the first candidate.
// NOTE(review): allocated with raw new; ownership presumably passes to the
// caller through the primitive_impl* return value - confirm.
210 auto eltwise = new eltwise_gpu(arg, best_kernels[0]);
218 attach_eltwise_gpu::attach_eltwise_gpu() {
219 implementation_map<eltwise>::add(
220 {{std::make_tuple(engine_types::ocl, data_types::f32, format::yxfb), eltwise_gpu::create},
221 {std::make_tuple(engine_types::ocl, data_types::f16, format::yxfb), eltwise_gpu::create},
222 {std::make_tuple(engine_types::ocl, data_types::i8, format::yxfb), eltwise_gpu::create},
223 {std::make_tuple(engine_types::ocl, data_types::i32, format::yxfb), eltwise_gpu::create},
224 {std::make_tuple(engine_types::ocl, data_types::i64, format::yxfb), eltwise_gpu::create},
225 {std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), eltwise_gpu::create},
226 {std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), eltwise_gpu::create},
227 {std::make_tuple(engine_types::ocl, data_types::i8, format::bfyx), eltwise_gpu::create},
228 {std::make_tuple(engine_types::ocl, data_types::i32, format::bfyx), eltwise_gpu::create},
229 {std::make_tuple(engine_types::ocl, data_types::i64, format::bfyx), eltwise_gpu::create},
230 {std::make_tuple(engine_types::ocl, data_types::f32, format::byxf), eltwise_gpu::create},
231 {std::make_tuple(engine_types::ocl, data_types::f16, format::byxf), eltwise_gpu::create},
232 {std::make_tuple(engine_types::ocl, data_types::i8, format::byxf), eltwise_gpu::create},
233 {std::make_tuple(engine_types::ocl, data_types::i32, format::byxf), eltwise_gpu::create},
234 {std::make_tuple(engine_types::ocl, data_types::i64, format::byxf), eltwise_gpu::create},
236 {std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx_f16), eltwise_gpu::create},
237 {std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx_f16), eltwise_gpu::create},
239 {std::make_tuple(engine_types::ocl, data_types::f32, format::bfzyx), eltwise_gpu::create},
240 {std::make_tuple(engine_types::ocl, data_types::f16, format::bfzyx), eltwise_gpu::create},
241 {std::make_tuple(engine_types::ocl, data_types::i8, format::bfzyx), eltwise_gpu::create},
242 {std::make_tuple(engine_types::ocl, data_types::i32, format::bfzyx), eltwise_gpu::create},
243 {std::make_tuple(engine_types::ocl, data_types::i64, format::bfzyx), eltwise_gpu::create},
244 {std::make_tuple(engine_types::ocl, data_types::f32, format::bfzyx_f16), eltwise_gpu::create},
245 {std::make_tuple(engine_types::ocl, data_types::f16, format::bfzyx_f16), eltwise_gpu::create},
246 {std::make_tuple(engine_types::ocl, data_types::i8, format::bfzyx_f16), eltwise_gpu::create},
247 {std::make_tuple(engine_types::ocl, data_types::i32, format::bfzyx_f16), eltwise_gpu::create},
248 {std::make_tuple(engine_types::ocl, data_types::i64, format::bfzyx_f16), eltwise_gpu::create},
250 {std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), eltwise_gpu::create},
251 {std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32), eltwise_gpu::create},
252 {std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), eltwise_gpu::create},
253 {std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), eltwise_gpu::create},
255 {std::make_tuple(engine_types::ocl, data_types::f16, format::fs_b_yx_fsv32), eltwise_gpu::create}});
258 } // namespace detail