b086560cd25c5c91dc88416e48e9191b8c40cf00
[platform/upstream/dldt.git] / inference-engine / thirdparty / clDNN / src / gpu / eltwise_gpu.cpp
1 /*
2 // Copyright (c) 2019 Intel Corporation
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //      http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 */
16
17 #include "eltwise_inst.h"
18 #include "primitive_gpu_base.h"
19 #include "implementation_map.h"
20 #include "error_handler.h"
21 #include "kernel_selector_helper.h"
22 #include "eltwise/eltwise_kernel_selector.h"
23 #include "eltwise/eltwise_kernel_base.h"
24 #include <vector>
25
26 namespace cldnn {
27 namespace gpu {
28
29 namespace {
30 inline kernel_selector::eltwise_mode convert_to_eltwise_mode(eltwise_mode mode) {
31     switch (mode) {
32         case eltwise_mode::sum:
33             return kernel_selector::eltwise_mode::ADD;
34         case eltwise_mode::sub:
35             return kernel_selector::eltwise_mode::SUB;
36         case eltwise_mode::max:
37             return kernel_selector::eltwise_mode::MAX;
38         case eltwise_mode::prod:
39             return kernel_selector::eltwise_mode::MUL;
40         case eltwise_mode::div:
41             return kernel_selector::eltwise_mode::DIV;
42         case eltwise_mode::min:
43             return kernel_selector::eltwise_mode::MIN;
44         case eltwise_mode::pow:
45             return kernel_selector::eltwise_mode::POW;
46         case eltwise_mode::mod:
47             return kernel_selector::eltwise_mode::MODULU;
48         case eltwise_mode::eq:
49             return kernel_selector::eltwise_mode::EQ;
50         case eltwise_mode::ne:
51             return kernel_selector::eltwise_mode::NE;
52         case eltwise_mode::lt:
53             return kernel_selector::eltwise_mode::LT;
54         case eltwise_mode::le:
55             return kernel_selector::eltwise_mode::LE;
56         case eltwise_mode::gt:
57             return kernel_selector::eltwise_mode::GT;
58         case eltwise_mode::ge:
59             return kernel_selector::eltwise_mode::GE;
60         case eltwise_mode::logic_and:
61             return kernel_selector::eltwise_mode::LOGIC_AND;
62         case eltwise_mode::logic_or:
63             return kernel_selector::eltwise_mode::LOGIC_OR;
64         case eltwise_mode::logic_xor:
65             return kernel_selector::eltwise_mode::LOGIC_XOR;
66         case eltwise_mode::squared_diff:
67             return kernel_selector::eltwise_mode::SQUARED_DIFF;
68         case eltwise_mode::floor_mod:
69             return kernel_selector::eltwise_mode::FLOOR_MOD;
70         default:
71             return kernel_selector::eltwise_mode::ADD;
72     }
73 }
74 }  // namespace
75
76 struct eltwise_gpu : typed_primitive_gpu_impl<eltwise> {
77     using parent = typed_primitive_gpu_impl<eltwise>;
78     using parent::parent;
79
80 protected:
81     kernel::kernel_arguments_data get_arguments(typed_primitive_inst<eltwise>& instance,
82                                                         int32_t split) const override {
83         kernel::kernel_arguments_data args = parent::get_arguments(instance, split);
84
85         args.output_calibration_factors =
86             (memory_impl::cptr) (instance.output_calibration_factors_term() ? &instance.output_calibration_factors_memory() : nullptr);
87         // TODO Inputs calibration factors - skipping for now as currently they should never be used in eltwise, create
88         // will throw
89
90         return args;
91     }
92
93 public:
94     static primitive_impl* create(const eltwise_node& arg) {
95         auto ew_params = get_default_params<kernel_selector::eltwise_params>(arg);
96         auto ew_optional_params =
97             get_default_optional_params<kernel_selector::eltwise_optional_params>(arg.get_program());
98
99         for (size_t i = 1; i < arg.inputs_count(); i++) {
100             ew_params.inputs.push_back(convert_data_tensor(arg.input(i).get_output_layout()));
101         }
102
103         const auto& primitive = arg.get_primitive();
104         if (primitive->with_activation)
105             convert_activation_func_params(primitive, ew_params.activation);
106
107         ew_params.operations.push_back({{kernel_selector::eltwise_params::InputType::Buffer(0),
108                                          kernel_selector::eltwise_params::InputType::Buffer(1)},
109                                         convert_to_eltwise_mode(primitive->mode)});
110
111         for (uint32_t i = 2; i < static_cast<uint32_t>(arg.inputs_count()); i++) {
112             ew_params.operations.push_back({{kernel_selector::eltwise_params::InputType::Intermediate(i - 2),
113                                              kernel_selector::eltwise_params::InputType::Buffer(i)},
114                                             convert_to_eltwise_mode(primitive->mode)});
115         }
116
117         if (primitive->mode == eltwise_mode::sum) {
118             ew_params.coefficients = primitive->coefficients;
119         }
120
121         for (size_t i = 0; i < ew_params.inputs.size(); i++) {
122             if (!ew_params.inputs[i].SameDims(ew_params.output)) {
123                 std::vector<int32_t> input_size = arg.input(i).get_output_layout().size.raw.vector();
124                 std::vector<int32_t> output_size = arg.get_output_layout().size.raw.vector();
125                 bool broadcast = false;
126                 for (size_t d = 0; d < output_size.size(); d++) {
127                     if (output_size[d] != 1 && input_size[d] == 1)
128                         broadcast = true;
129                 }
130                 if (broadcast) {
131                     ew_params.broadcast = true;
132                     break;
133                 } else {
134                     ew_params.layoutBased = true;
135                     break;
136                 }
137             }
138         }
139
140         // stride
141         if (!primitive->stride.empty()) {
142             const auto& stride = primitive->stride;
143             ew_params.stride.resize(stride.size());
144             for (size_t i = 0; i < primitive->stride.size(); i++) {
145                 ew_params.stride[i] = {(uint32_t)stride[i].spatial[0],
146                                        (uint32_t)stride[i].spatial[1],
147                                        (uint32_t)stride[i].spatial[2]};
148             }
149         }
150
151         // check if strides are the same
152         if (!ew_params.stride.empty()) {
153             const auto& stride = ew_params.stride[0];
154             for (size_t i = 1; i < ew_params.stride.size(); i++) {
155                 if (stride.x != ew_params.stride[i].x || stride.y != ew_params.stride[i].y)
156                     ew_params.layoutBased = true;
157             }
158         } else if (!ew_params.inputs[0].SameDimsSizes(ew_params.inputs[1])) {
159             ew_params.broadcast = true;
160         }
161
162         if (primitive->output_calibration_factors.size() > 0 || primitive->output_quantization_factor != 1.0f) {
163             ew_params.int8_quantization = true;
164
165             if (primitive->output_calibration_factors.size() > 0) {
166                 ew_params.output_calibration = true;
167                 ew_params.output_calibration_factors.push_back(
168                     convert_data_tensor(arg.output_calibration_factors().get_output_layout())
169                         .FlattenFeatureAndSpatials());
170             } else {
171                 ew_params.output_quantization_factor = arg.get_output_qf();
172             }
173         }
174
175         CLDNN_ERROR_BOOL(arg.id(),
176                          "Eltwise inputs calibration term",
177                          arg.inputs_calibration_term(),
178                          "Eltwise does not yet support inputs calibration, it should be fused with convolution");
179
180         if (arg.inputs_calibration_term()) {
181             ew_params.int8_quantization = true;
182             ew_params.inputs_calibration = true;
183
184             for (size_t i = 0; i < primitive->inputs_calibration_factors.size(); ++i) {
185                 auto icf_layout = arg.input_calibration_factors(i).get_output_layout();
186                 ew_params.inputs_calibration_factors.push_back(
187                     convert_data_tensor(icf_layout).FlattenFeatureAndSpatials());
188             }
189         }
190
191         CLDNN_ERROR_BOOL(arg.id(),
192                          "Eltwise inputs quantization term",
193                          arg.inputs_quantization_term(),
194                          "Eltwise does not yet support inputs quantization, it should be fused with convolution");
195
196         if (arg.inputs_quantization_term()) {
197             ew_params.int8_quantization = true;
198
199             for (const auto& iqf : primitive->input_quantization_factors) {
200                 ew_params.input_quantization_factors.push_back(iqf);
201             }
202         }
203
204         auto& kernel_selector = kernel_selector::eltwise_kernel_selector::Instance();
205         auto best_kernels = kernel_selector.GetBestKernels(ew_params, ew_optional_params);
206
207         CLDNN_ERROR_BOOL(arg.id(),
208                          "Best_kernel.empty()",
209                          best_kernels.empty(),
210                          "Cannot find a proper kernel with this arguments");
211
212         auto eltwise = new eltwise_gpu(arg, best_kernels[0]);
213
214         return eltwise;
215     }
216 };
217
218 namespace {
219 struct attach {
220     attach() {
221         implementation_map<eltwise>::add(
222             {{std::make_tuple(engine_types::ocl, data_types::f32, format::yxfb), eltwise_gpu::create},
223              {std::make_tuple(engine_types::ocl, data_types::f16, format::yxfb), eltwise_gpu::create},
224              {std::make_tuple(engine_types::ocl, data_types::i8, format::yxfb), eltwise_gpu::create},
225              {std::make_tuple(engine_types::ocl, data_types::i32, format::yxfb), eltwise_gpu::create},
226              {std::make_tuple(engine_types::ocl, data_types::i64, format::yxfb), eltwise_gpu::create},
227              {std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), eltwise_gpu::create},
228              {std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), eltwise_gpu::create},
229              {std::make_tuple(engine_types::ocl, data_types::i8, format::bfyx), eltwise_gpu::create},
230              {std::make_tuple(engine_types::ocl, data_types::i32, format::bfyx), eltwise_gpu::create},
231              {std::make_tuple(engine_types::ocl, data_types::i64, format::bfyx), eltwise_gpu::create},
232              {std::make_tuple(engine_types::ocl, data_types::f32, format::byxf), eltwise_gpu::create},
233              {std::make_tuple(engine_types::ocl, data_types::f16, format::byxf), eltwise_gpu::create},
234              {std::make_tuple(engine_types::ocl, data_types::i8, format::byxf), eltwise_gpu::create},
235              {std::make_tuple(engine_types::ocl, data_types::i32, format::byxf), eltwise_gpu::create},
236              {std::make_tuple(engine_types::ocl, data_types::i64, format::byxf), eltwise_gpu::create},
237              // block f16
238              {std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx_f16), eltwise_gpu::create},
239              {std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx_f16), eltwise_gpu::create},
240              // 3D
241              {std::make_tuple(engine_types::ocl, data_types::f32, format::bfzyx), eltwise_gpu::create},
242              {std::make_tuple(engine_types::ocl, data_types::f16, format::bfzyx), eltwise_gpu::create},
243              {std::make_tuple(engine_types::ocl, data_types::i8, format::bfzyx), eltwise_gpu::create},
244              {std::make_tuple(engine_types::ocl, data_types::i32, format::bfzyx), eltwise_gpu::create},
245              {std::make_tuple(engine_types::ocl, data_types::i64, format::bfzyx), eltwise_gpu::create},
246              // MMAD
247              {std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), eltwise_gpu::create},
248              {std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32), eltwise_gpu::create},
249              {std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), eltwise_gpu::create},
250              {std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), eltwise_gpu::create},
251              //
252              {std::make_tuple(engine_types::ocl, data_types::f16, format::fs_b_yx_fsv32), eltwise_gpu::create}});
253     }
254     ~attach() {}
255 };
256 attach attach_impl;
257 }  // namespace
258 }  // namespace gpu
259 }  // namespace cldnn