910dcc4a5a3ecdeeee7c35da315abab3ddcee464
[platform/upstream/dldt.git] / inference-engine / thirdparty / clDNN / src / gpu / eltwise_gpu.cpp
1 /*
2 // Copyright (c) 2019 Intel Corporation
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //      http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 */
16
17 #include "eltwise_inst.h"
18 #include "primitive_gpu_base.h"
19 #include "implementation_map.h"
20 #include "error_handler.h"
21 #include "kernel_selector_helper.h"
22 #include "eltwise/eltwise_kernel_selector.h"
23 #include "eltwise/eltwise_kernel_base.h"
24 #include <vector>
25
26 namespace cldnn {
27 namespace gpu {
28
29 namespace {
30 inline kernel_selector::eltwise_mode convert_to_eltwise_mode(eltwise_mode mode) {
31     switch (mode) {
32         case eltwise_mode::sum:
33             return kernel_selector::eltwise_mode::ADD;
34         case eltwise_mode::sub:
35             return kernel_selector::eltwise_mode::SUB;
36         case eltwise_mode::max:
37             return kernel_selector::eltwise_mode::MAX;
38         case eltwise_mode::prod:
39             return kernel_selector::eltwise_mode::MUL;
40         case eltwise_mode::div:
41             return kernel_selector::eltwise_mode::DIV;
42         case eltwise_mode::min:
43             return kernel_selector::eltwise_mode::MIN;
44         case eltwise_mode::pow:
45             return kernel_selector::eltwise_mode::POW;
46         case eltwise_mode::mod:
47             return kernel_selector::eltwise_mode::MODULU;
48         case eltwise_mode::eq:
49             return kernel_selector::eltwise_mode::EQ;
50         case eltwise_mode::ne:
51             return kernel_selector::eltwise_mode::NE;
52         case eltwise_mode::lt:
53             return kernel_selector::eltwise_mode::LT;
54         case eltwise_mode::le:
55             return kernel_selector::eltwise_mode::LE;
56         case eltwise_mode::gt:
57             return kernel_selector::eltwise_mode::GT;
58         case eltwise_mode::ge:
59             return kernel_selector::eltwise_mode::GE;
60         case eltwise_mode::logic_and:
61             return kernel_selector::eltwise_mode::LOGIC_AND;
62         case eltwise_mode::logic_or:
63             return kernel_selector::eltwise_mode::LOGIC_OR;
64         case eltwise_mode::logic_xor:
65             return kernel_selector::eltwise_mode::LOGIC_XOR;
66         case eltwise_mode::squared_diff:
67             return kernel_selector::eltwise_mode::SQUARED_DIFF;
68         case eltwise_mode::floor_mod:
69             return kernel_selector::eltwise_mode::FLOOR_MOD;
70         default:
71             return kernel_selector::eltwise_mode::ADD;
72     }
73 }
74 }  // namespace
75
76 struct eltwise_gpu : typed_primitive_gpu_impl<eltwise> {
77     using parent = typed_primitive_gpu_impl<eltwise>;
78     using parent::parent;
79
80 protected:
81     kernel::kernel_arguments_data get_arguments(typed_primitive_inst<eltwise>& instance,
82                                                         int32_t split) const override {
83         kernel::kernel_arguments_data args = parent::get_arguments(instance, split);
84
85         args.output_calibration_factors =
86             (memory_impl::cptr) (instance.output_calibration_factors_term() ? &instance.output_calibration_factors_memory() : nullptr);
87         // TODO Inputs calibration factors - skipping for now as currently they should never be used in eltwise, create
88         // will throw
89
90         return args;
91     }
92
93 public:
94     static primitive_impl* create(const eltwise_node& arg) {
95         auto ew_params = get_default_params<kernel_selector::eltwise_params>(arg);
96         auto ew_optional_params =
97             get_default_optional_params<kernel_selector::eltwise_optional_params>(arg.get_program());
98
99         for (size_t i = 1; i < arg.inputs_count(); i++) {
100             ew_params.inputs.push_back(convert_data_tensor(arg.input(i).get_output_layout()));
101         }
102
103         const auto& primitive = arg.get_primitive();
104
105         ew_params.operations.push_back({{kernel_selector::eltwise_params::InputType::Buffer(0),
106                                          kernel_selector::eltwise_params::InputType::Buffer(1)},
107                                         convert_to_eltwise_mode(primitive->mode)});
108
109         for (uint32_t i = 2; i < static_cast<uint32_t>(arg.inputs_count()); i++) {
110             ew_params.operations.push_back({{kernel_selector::eltwise_params::InputType::Intermediate(i - 2),
111                                              kernel_selector::eltwise_params::InputType::Buffer(i)},
112                                             convert_to_eltwise_mode(primitive->mode)});
113         }
114
115         if (primitive->mode == eltwise_mode::sum) {
116             ew_params.coefficients = primitive->coefficients;
117         }
118
119         for (size_t i = 0; i < ew_params.inputs.size(); i++) {
120             if (!ew_params.inputs[i].SameDims(ew_params.output)) {
121                 std::vector<int32_t> input_size = arg.input(i).get_output_layout().size.raw.vector();
122                 std::vector<int32_t> output_size = arg.get_output_layout().size.raw.vector();
123                 bool broadcast = false;
124                 for (size_t d = 0; d < output_size.size(); d++) {
125                     if (output_size[d] != 1 && input_size[d] == 1)
126                         broadcast = true;
127                 }
128                 if (broadcast) {
129                     ew_params.broadcast = true;
130                     break;
131                 } else {
132                     ew_params.layoutBased = true;
133                     break;
134                 }
135             }
136         }
137
138         // stride
139         if (!primitive->stride.empty()) {
140             const auto& stride = primitive->stride;
141             ew_params.stride.resize(stride.size());
142             for (size_t i = 0; i < primitive->stride.size(); i++) {
143                 ew_params.stride[i] = {(uint32_t)stride[i].spatial[0],
144                                        (uint32_t)stride[i].spatial[1],
145                                        (uint32_t)stride[i].spatial[2]};
146             }
147         }
148
149         // check if strides are the same
150         if (!ew_params.stride.empty()) {
151             const auto& stride = ew_params.stride[0];
152             for (size_t i = 1; i < ew_params.stride.size(); i++) {
153                 if (stride.x != ew_params.stride[i].x || stride.y != ew_params.stride[i].y)
154                     ew_params.layoutBased = true;
155             }
156         } else if (!ew_params.inputs[0].SameDimsSizes(ew_params.inputs[1])) {
157             ew_params.broadcast = true;
158         }
159
160         if (primitive->output_calibration_factors.size() > 0 || primitive->output_quantization_factor != 1.0f) {
161             ew_params.int8_quantization = true;
162
163             if (primitive->output_calibration_factors.size() > 0) {
164                 ew_params.output_calibration = true;
165                 ew_params.output_calibration_factors.push_back(
166                     convert_data_tensor(arg.output_calibration_factors().get_output_layout())
167                         .FlattenFeatureAndSpatials());
168             } else {
169                 ew_params.output_quantization_factor = arg.get_output_qf();
170             }
171         }
172
173         CLDNN_ERROR_BOOL(arg.id(),
174                          "Eltwise inputs calibration term",
175                          arg.inputs_calibration_term(),
176                          "Eltwise does not yet support inputs calibration, it should be fused with convolution");
177
178         if (arg.inputs_calibration_term()) {
179             ew_params.int8_quantization = true;
180             ew_params.inputs_calibration = true;
181
182             for (size_t i = 0; i < primitive->inputs_calibration_factors.size(); ++i) {
183                 auto icf_layout = arg.input_calibration_factors(i).get_output_layout();
184                 ew_params.inputs_calibration_factors.push_back(
185                     convert_data_tensor(icf_layout).FlattenFeatureAndSpatials());
186             }
187         }
188
189         CLDNN_ERROR_BOOL(arg.id(),
190                          "Eltwise inputs quantization term",
191                          arg.inputs_quantization_term(),
192                          "Eltwise does not yet support inputs quantization, it should be fused with convolution");
193
194         if (arg.inputs_quantization_term()) {
195             ew_params.int8_quantization = true;
196
197             for (const auto& iqf : primitive->input_quantization_factors) {
198                 ew_params.input_quantization_factors.push_back(iqf);
199             }
200         }
201
202         auto& kernel_selector = kernel_selector::eltwise_kernel_selector::Instance();
203         auto best_kernels = kernel_selector.GetBestKernels(ew_params, ew_optional_params);
204
205         CLDNN_ERROR_BOOL(arg.id(),
206                          "Best_kernel.empty()",
207                          best_kernels.empty(),
208                          "Cannot find a proper kernel with this arguments");
209
210         auto eltwise = new eltwise_gpu(arg, best_kernels[0]);
211
212         return eltwise;
213     }
214 };
215
216 namespace detail {
217
218 attach_eltwise_gpu::attach_eltwise_gpu() {
219     implementation_map<eltwise>::add(
220         {{std::make_tuple(engine_types::ocl, data_types::f32, format::yxfb), eltwise_gpu::create},
221          {std::make_tuple(engine_types::ocl, data_types::f16, format::yxfb), eltwise_gpu::create},
222          {std::make_tuple(engine_types::ocl, data_types::i8, format::yxfb), eltwise_gpu::create},
223          {std::make_tuple(engine_types::ocl, data_types::i32, format::yxfb), eltwise_gpu::create},
224          {std::make_tuple(engine_types::ocl, data_types::i64, format::yxfb), eltwise_gpu::create},
225          {std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), eltwise_gpu::create},
226          {std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), eltwise_gpu::create},
227          {std::make_tuple(engine_types::ocl, data_types::i8, format::bfyx), eltwise_gpu::create},
228          {std::make_tuple(engine_types::ocl, data_types::i32, format::bfyx), eltwise_gpu::create},
229          {std::make_tuple(engine_types::ocl, data_types::i64, format::bfyx), eltwise_gpu::create},
230          {std::make_tuple(engine_types::ocl, data_types::f32, format::byxf), eltwise_gpu::create},
231          {std::make_tuple(engine_types::ocl, data_types::f16, format::byxf), eltwise_gpu::create},
232          {std::make_tuple(engine_types::ocl, data_types::i8, format::byxf), eltwise_gpu::create},
233          {std::make_tuple(engine_types::ocl, data_types::i32, format::byxf), eltwise_gpu::create},
234          {std::make_tuple(engine_types::ocl, data_types::i64, format::byxf), eltwise_gpu::create},
235          // block f16
236          {std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx_f16), eltwise_gpu::create},
237          {std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx_f16), eltwise_gpu::create},
238          // 3D
239          {std::make_tuple(engine_types::ocl, data_types::f32, format::bfzyx), eltwise_gpu::create},
240          {std::make_tuple(engine_types::ocl, data_types::f16, format::bfzyx), eltwise_gpu::create},
241          {std::make_tuple(engine_types::ocl, data_types::i8, format::bfzyx), eltwise_gpu::create},
242          {std::make_tuple(engine_types::ocl, data_types::i32, format::bfzyx), eltwise_gpu::create},
243          {std::make_tuple(engine_types::ocl, data_types::i64, format::bfzyx), eltwise_gpu::create},
244          {std::make_tuple(engine_types::ocl, data_types::f32, format::bfzyx_f16), eltwise_gpu::create},
245          {std::make_tuple(engine_types::ocl, data_types::f16, format::bfzyx_f16), eltwise_gpu::create},
246          {std::make_tuple(engine_types::ocl, data_types::i8, format::bfzyx_f16), eltwise_gpu::create},
247          {std::make_tuple(engine_types::ocl, data_types::i32, format::bfzyx_f16), eltwise_gpu::create},
248          {std::make_tuple(engine_types::ocl, data_types::i64, format::bfzyx_f16), eltwise_gpu::create},
249          // MMAD
250          {std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), eltwise_gpu::create},
251          {std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32), eltwise_gpu::create},
252          {std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), eltwise_gpu::create},
253          {std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), eltwise_gpu::create},
254          //
255          {std::make_tuple(engine_types::ocl, data_types::f16, format::fs_b_yx_fsv32), eltwise_gpu::create}});
256 }
257
258 }  // namespace detail
259 }  // namespace gpu
260 }  // namespace cldnn