Publishing 2019 R1 content
[platform/upstream/dldt.git] / inference-engine / thirdparty / clDNN / src / gpu / fused_conv_eltwise_gpu.cpp
1 /*
2 // Copyright (c) 2016 Intel Corporation
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //      http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 */
16
17 #include "fused_conv_eltwise_inst.h"
18 #include "primitive_gpu_base.h"
19 #include "implementation_map.h"
20 #include "error_handler.h"
21 #include "kernel_selector_helper.h"
22 #include "kernel_runner.h"
23 #include "fused_conv_eltwise/fused_conv_eltwise_kernel_selector.h"
24 #include "fused_conv_eltwise/fused_conv_eltwise_kernel_base.h"
25
26 namespace cldnn { namespace gpu {
27
28 struct fused_conv_eltwise_gpu : typed_primitive_gpu_impl<fused_conv_eltwise>
29 {
30     using parent = typed_primitive_gpu_impl<fused_conv_eltwise>;
31     using parent::parent;
32
33 protected:
34
35     virtual bool validate_impl(const typed_primitive_inst<fused_conv_eltwise>& instance) const override
36     {
37         bool res = true;
38
39         auto outer_id = _outer.id();
40         auto data_type = instance.node.input().get_output_layout().data_type;
41
42         // Check whether all memory elements use the same unit type (FP16 or FP32).
43         CLDNN_ERROR_DATA_TYPES_MISMATCH(outer_id, "Input memory", data_type, "output memory", instance.node.get_output_layout().data_type, "");
44         CLDNN_ERROR_DATA_TYPES_MISMATCH(outer_id, "Input memory", data_type, "filter memory", instance.weights_memory(0).get_layout().data_type, "");
45
46         return res;
47     }
48
49     virtual kernel::kernel_arguments_data get_arguments(typed_primitive_inst<fused_conv_eltwise>& instance, int32_t split) const override
50     {
51         kernel::kernel_arguments_data args = parent::get_arguments(instance, split);
52
53         args.weights              = &instance.weights_memory(split);
54         args.bias                 = instance.bias_term() ? &instance.bias_memory(split) : nullptr;
55         args.weights_quantization_factors = instance.weights_quantization_factors_term() ? &instance.weights_quantization_factors_memory(split) : nullptr;
56         args.output_calibration_factors = instance.conv_output_calibration_factors_term() ? &instance.output_calibration_factors_memory(split) : nullptr;
57         if (instance.eltw_output_calibration_factors_term())
58             args.fused_op_calibration_factors.push_back(&instance.eltw_output_calibration_factors_memory());
59         return args;
60     }
61
62     virtual int32_t get_split() const override
63     { 
64         return _outer.get_split(); 
65     }
66
67 public:
68
69     static primitive_impl* create(const fused_conv_eltwise_node &arg)
70     {
71         const auto& primitive       = arg.get_primitive();
72         const auto& input_layout    = arg.input().get_output_layout();
73         const auto& weights_layout  = arg.weights(0).get_output_layout();
74         const auto& weights_size    = weights_layout.size;
75
76         const auto& split           = primitive->split();
77         const auto& stride          = primitive->conv.stride;
78         const auto& dilation        = primitive->conv.dilation;
79         const auto& input_offset    = primitive->conv.input_offset;
80
81         const auto depthwise_separable_opt = arg.get_depthwise_sep_opt();
82         const auto actual_split = depthwise_separable_opt ? (decltype(split))1 : split;
83
84         const auto transposed = arg.get_transposed();
85
86         assert(arg.get_output_layout().size.feature[0] / primitive->split() == weights_layout.size.batch[0]);
87
88         // conv params
89         auto fused_params = get_weights_bias_default_params<kernel_selector::fused_conv_eltwise_params>(arg, actual_split);
90         // add second input for eltwise
91         if (!static_cast<const fused_conv_eltwise*>(arg.get_primitive().get())->second_input_in_output)
92         {
93             fused_params.inputs.push_back(convert_data_tensor(arg.input(1).get_output_layout()));
94         }
95
96         auto& conv_params = fused_params.conv;
97         auto& eltw_params = fused_params.eltw;
98
99         auto conv_optional_params = get_default_weights_bias_optional_params<kernel_selector::fused_conv_eltwise_optional_params>(arg.get_program());
100
101         const auto additional_offset = tensor::max(input_offset, 0);
102         if (additional_offset != 0)
103         {
104             fused_params.inputs[0] = convert_data_tensor(input_layout, actual_split, additional_offset);
105         }
106
107         if (primitive->conv.with_activation)
108         {
109             convert_activation_func_params(&primitive->conv, fused_params.activation);
110         }
111         if (primitive->eltw.with_activation)
112         {
113             convert_activation_func_params(&primitive->eltw, fused_params.eltw.activation);
114         }
115
116         fused_params.conv.depthwise_separable_opt = depthwise_separable_opt;
117         fused_params.conv.transposed = transposed;
118
119         fused_params.second_input_in_output = primitive->second_input_in_output;
120
121         conv_params.local_convolution = weights_size.local[0] > 1 || weights_size.local[1] > 1;
122         conv_params.split = split;
123         conv_params.filterSize = {
124             (uint32_t)weights_size.spatial[0],
125             (uint32_t)weights_size.spatial[1],
126         };
127
128         conv_params.padding = {
129             (uint32_t)std::max(-input_offset.spatial[0], 0),
130             (uint32_t)std::max(-input_offset.spatial[1], 0)
131         };
132
133         conv_params.stride = {
134             (uint32_t)stride.spatial[0],
135             (uint32_t)stride.spatial[1]
136         };
137         conv_params.dilation = {
138             (uint32_t)dilation.spatial[0],
139             (uint32_t)dilation.spatial[1]
140         };
141         
142         if (primitive->conv.weights_quantization_factors.size() > 0)
143         {
144             conv_params.int8_quantization = true;
145             conv_params.weights_quantization_factors.push_back(convert_data_tensor(arg.weights_quantization_factors().get_output_layout()).FlattenFeatureAndSpatials());
146             conv_params.input_quantization_factor = arg.get_conv_input_qf();
147
148             if (primitive->conv.output_calibration_factors.size() > 0)
149             {
150                 conv_params.output_calibration = true;
151                 conv_params.output_calibration_factors.push_back(convert_data_tensor(arg.conv_output_calibration_factors().get_output_layout()).FlattenFeatureAndSpatials());
152             }
153             else
154                 conv_params.output_quantization_factor = arg.get_conv_output_qf();
155         }
156
157         // eltw params
158         if (primitive->eltw.output_calibration_factors.size() > 0 || primitive->eltw.output_quantization_factor != 1.0f)
159         {
160             eltw_params.int8_quantization = true;
161
162             if (primitive->eltw.output_calibration_factors.size() > 0)
163             {
164                 eltw_params.output_calibration = true;
165                 eltw_params.output_calibration_factors.push_back(convert_data_tensor(arg.eltw_output_calibration_factors().get_output_layout()).FlattenFeatureAndSpatials());
166             }
167             else
168                 eltw_params.output_quantization_factor = arg.get_eltw_output_qf();
169         }
170
171         // stride
172         if (!primitive->eltw.stride.empty())
173         {
174             const auto& eltw_stride = primitive->eltw.stride;
175             eltw_params.stride.resize(eltw_stride.size());
176             for (size_t i = 0; i < primitive->eltw.stride.size(); i++)
177             {
178                 eltw_params.stride[i] = { (uint32_t)eltw_stride[i].spatial[0], (uint32_t)eltw_stride[i].spatial[1] };
179             }
180         }
181
182         auto& kernel_selector = kernel_selector::fused_conv_eltwise_kernel_selector::Instance();
183
184         const auto& tuning_config = arg.get_program().get_options().get<build_option_type::tuning_config>();
185
186         if (tuning_config->config.mode == tuning_mode::tuning_tune_and_cache)
187         {
188             conv_optional_params.tuningParams.runner = std::make_shared<gpu::kernel_runner>(arg.get_program().get_engine(), true);
189         }
190
191         kernel_selector::KernelsData best_kernels = kernel_selector.GetBestKernels(fused_params, conv_optional_params);
192                 
193         CLDNN_ERROR_BOOL(arg.id(), "Best_kernel.empty()", best_kernels.empty(), "Cannot find a proper kernel with this arguments");
194
195         auto conv = new fused_conv_eltwise_gpu(arg, best_kernels[0]);
196
197         return conv;
198     }
199 };
200
201 namespace{
202     struct attach {
203         attach() {
204             implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), fused_conv_eltwise_gpu::create);
205             implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::yxfb), fused_conv_eltwise_gpu::create);
206             implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), fused_conv_eltwise_gpu::create);
207             // MMAD
208             implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32), fused_conv_eltwise_gpu::create);
209         }
210         ~attach() {}
211     };
212     attach attach_impl;
213 }
214 } }