Publishing 2019 R1 content
[platform/upstream/dldt.git] / inference-engine / thirdparty / clDNN / src / graph_optimizer / reorder_inputs.cpp
1 /*
2 // Copyright (c) 2018 Intel Corporation
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //      http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 */
16
17 ///////////////////////////////////////////////////////////////////////////////////////////////////
18
19
20 #include "api/CPP/proposal.hpp"
21 #include "api/CPP/roi_pooling.hpp"
22 #include "api/CPP/reorg_yolo.hpp"
23 #include "api/CPP/eltwise.hpp"
24 #include "upsampling_inst.h"
25 #include "pass_manager.h"
26 #include "program_node.h"
27 #include "layout_optimizer.h"
28 #include "program_impl.h"
29 #include "program_helpers.h"
30
31 using namespace cldnn;
32
33 //ToDo remove friendship relation from program_impl
34
35 reorder_inputs::reorder_inputs(layout_optimizer& lo_ref) : base_pass("reorder_inputs"), _lo(lo_ref) {}
36
37 void reorder_inputs::run(program_impl& p) {
38     run(p, _lo);
39 }
40
// Main body of the pass. Two stages over the processing order:
//   1) collect global layout_optimizer attributes from the topology
//      (splitted convolutions, layers that must stay in bfyx);
//   2) for every convolution and detection_output node, ask the layout
//      optimizer which input format it wants and insert/adjust reorder
//      nodes in front of the node accordingly. For convolutions the
//      special formats winograd_2x3_s1_data, bf8_xy16 and byxf also get
//      a "reorder back" node appended after the convolution.
void reorder_inputs::run(program_impl& p, layout_optimizer& lo)
{
    //first pass to set layout optimization_attributes for topology
    for (auto& node : p.get_processing_order())
    {
        auto& prim = *node;
        if (prim.type() == cldnn::convolution::type_id())
        {
            // split() > 1 marks a splitted (grouped) convolution; record it so
            // the optimizer can account for it when choosing formats.
            if (prim.as<convolution>().get_primitive()->split() > 1)
                lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::splitted_convolution, 1);
        }

        //list of layers that do not support yxfb or perform worse than bfyx
        if (prim.type() == cldnn::detection_output::type_id() || prim.type() == cldnn::proposal::type_id() ||
            prim.type() == cldnn::roi_pooling::type_id() || prim.type() == cldnn::deconvolution::type_id() ||
            prim.type() == cldnn::upsampling::type_id() || prim.type() == cldnn::reorg_yolo::type_id())
            lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::bfyx_only_layer, 1);
    }

    // Handles one convolution node: decides (via the layout optimizer) whether a
    // reorder must be inserted in front of it, reuses/retargets an existing
    // input reorder when possible, and post-processes the winograd / 1x1 cases.
    const auto reorder_input = [&p, &lo](typed_program_node<convolution>& conv_node)
    {
        auto conv_prim = conv_node.get_primitive();
        auto& input_node = conv_node.get_dependency(0);
        auto&& weights_layout = conv_node.weights(0).get_output_layout();
        auto&& input_layout = input_node.get_output_layout();

        // The reorder (if any) that should feed this convolution; stays null
        // when the current input layout is already what the optimizer wants.
        std::shared_ptr<reorder> new_input = nullptr;

        if (input_node.type() == reorder::type_id()) //convolution's input is a reorder
        {
            auto reorder_prim = input_node.as<reorder>().typed_desc();
            auto& reorder_input = input_node.get_dependency(0);
            auto reorder_layout = input_node.get_output_layout();
            // NOTE(review): output_data_type is dereferenced without a check —
            // presumably it is always set on reorder primitives here; verify.
            reorder_layout.data_type = *reorder_prim->output_data_type;
            new_input = lo.get_reorder(
                reorder_layout,
                reorder_prim->id,
                layout_optimizer::data_type::input,
                conv_node,
                weights_layout).first;

            auto reorder_removed = false;
            if (new_input && new_input->output_format != format::winograd_2x3_s1_data && new_input->output_format != format::bf8_xy16 && new_input->output_format != format::byxf) //output format is not optimal
            {
                auto reorder_input_layout = reorder_input.get_output_layout();

                auto opt_layout = layout(*new_input->output_data_type, new_input->output_format, reorder_input_layout.size);
                if (reorder_input_layout == opt_layout) //reorder 'breaks' optimal format
                {
                    // The data BEFORE the existing reorder is already in the optimal
                    // layout, so the reorder only hurts.
                    if (reorder_prim->subtract_per_feature.empty() &&
                        reorder_prim->mean.empty() &&
                        !reorder_prim->output_padding) //just plain reorder
                    {
                        // No mean/subtract/padding side effects: bypass the reorder
                        // entirely and, if nobody else uses it, remove it from the graph.
                        conv_node.replace_dependency(0, reorder_input);
                        if (input_node.get_users().size() == 0 && !input_node.is_output())
                        {
                            reorder_removed = p.extract_and_remove(input_node);
                        }
                        new_input = nullptr;
                    }
                    else //change reorder's output layout
                    {
                        // Reorder also applies mean subtraction or padding — keep it,
                        // but retarget its output to the optimal layout.
                        reorder_prim->output_format = opt_layout.format;
                        reorder_prim->output_data_type = opt_layout.data_type;
                        new_input = nullptr;
                    }
                }
                else //current reorder gives bad output, simply change it
                {
                    reorder_prim->output_format = opt_layout.format;
                    reorder_prim->output_data_type = opt_layout.data_type;
                    new_input = nullptr;
                }
            }

            // Layouts downstream of whichever node survived must be refreshed.
            if (!reorder_removed)
                input_node.recalc_output_layout();
            else
                conv_node.recalc_output_layout();
        }
        else
        {
            // Input is not a reorder — simply ask the optimizer whether one is
            // needed between the input node and the convolution.
            new_input = lo.get_reorder(
                input_node.get_output_layout(),
                input_node.id(),
                layout_optimizer::data_type::input,
                conv_node,
                weights_layout).first;
        }

        // Winograd path: pad the input to the sizes the winograd kernel requires,
        // then append a reorder after the convolution to convert back to the
        // original format (plus an eltwise to add bias, since the winograd kernel
        // apparently does not consume the bias input directly).
        if (new_input && new_input->output_format == format::winograd_2x3_s1_data)
        {
            auto lower_size = (conv_prim->input_offset.negate() + input_layout.size);

            tensor upper_input_padding = tensor{ 0 };
            upper_input_padding.spatial[0] = (2 - (lower_size.spatial[0] % 2)) % 2;          //winograd conv requires input's x to be in form 4 + 2n, with restriction that x >= 3, we can shortage it to x % 2 == 0
            upper_input_padding.spatial[1] = (8 - ((lower_size.spatial[1] - 2) % 8)) % 8;    //for y, y - 2 % 8 == 0 must hold

            p.apply_needed_padding(conv_node, input_node, padding{ conv_prim->input_offset.negate().sizes(), upper_input_padding.sizes() });

            // "Reorder back" node converting winograd output to the pre-pass
            // format; it inherits the convolution's output padding, which is
            // then cleared on the convolution itself.
            auto winograd_output = std::make_shared<reorder>("_winograd_" + conv_node.id(), conv_node.id(), input_layout.format,
                input_layout.data_type, std::vector<float>{}, cldnn_reorder_mean_mode::mean_subtract, conv_node.output_layout.data_padding);
            conv_node.output_layout.data_padding = padding{};
            program_node& back_node = p.get_or_create(winograd_output);
            p.get_processing_order().insert_next(&conv_node, &back_node);

            auto bias_term = conv_node.bias_term();
            //create additional eltwise node after reorder to compute bias
            if (bias_term)
            {
                auto& bias_node = conv_node.get_dependency(2);
                std::vector<primitive_id> inputs = { back_node.id(), bias_node.id() };
                // The eltwise sum also takes over the convolution's activation and
                // the back-reorder's output padding (which is cleared, mirroring
                // the padding hand-off done for the convolution above).
                auto winograd_output_biases = std::make_shared<eltwise>(back_node.id() + "_bias", inputs,
                    cldnn::eltwise_mode::sum, conv_prim->with_activation, conv_prim->activation_negative_slope,
                    back_node.get_output_layout().data_padding);
                back_node.get_output_layout().data_padding = padding{};
                auto& back_bias_node = p.get_or_create(winograd_output_biases);
                p.get_processing_order().insert_next(&back_node, &back_bias_node);
                // Rewire: users of the back-reorder (and of the convolution) now
                // consume the bias eltwise instead.
                p.replace_all_usages(back_node, back_bias_node);
                p.add_connection(back_node, back_bias_node);
                p.add_connection(bias_node, back_bias_node);
                conv_node.invalidate_users();
                p.replace_all_usages(conv_node, back_bias_node);
            }

            if (conv_prim->with_activation)
            {
                // Activation moved off the convolution: either the eltwise above
                // (bias case) or the back-reorder (no-bias case) performs it.
                conv_node.typed_desc()->with_activation = false;
                if (!bias_term)
                    back_node.set_fused_activation(activation_relu_negative_slope, cldnn_activation_additional_params_t{ conv_prim->activation_negative_slope });
            }

            if (!bias_term)
            {
                conv_node.invalidate_users();
                p.replace_all_usages(conv_node, back_node);
            }
            p.add_connection(conv_node, back_node);

            // Materialize the winograd input reorder node; input_offset handling
            // moves onto the reorder itself.
            auto& r_node = p.get_or_create(new_input);
            r_node.as<reorder>().set_input_offset(conv_prim->input_offset);

            if (!bias_term)
            {
                // Make the back-reorder assume the convolution's external identity
                // (name and, if applicable, network-output status).
                p.swap_names(conv_node, back_node);
                if (conv_node.is_output())
                {
                    conv_node.set_output(false);
                    back_node.set_output(true);
                    for (auto& output : p.get_outputs())
                    {
                        if (output == &conv_node)
                        {
                            output = &back_node;
                            break;
                        }
                    }
                }
            }
            else
            {
                // Bias is now added by the eltwise node, so drop the convolution's
                // bias dependency and let the eltwise assume the external identity.
                conv_node.remove_dependency(2);
                auto& back_bias_node = *(p.nodes_map.find(back_node.id() + "_bias")->second);
                p.swap_names(conv_node, back_bias_node);
                if (conv_node.is_output())
                {
                    conv_node.set_output(false);
                    back_bias_node.set_output(true);
                    for (auto& output : p.get_outputs())
                    {
                        if (output == &conv_node)
                        {
                            output = &back_bias_node;
                            break;
                        }
                    }
                }
            }
        }

        // 1x1-optimized formats: append a reorder after the convolution to
        // restore the original input format for downstream consumers.
        if (new_input && (new_input->output_format == format::bf8_xy16 || new_input->output_format == format::byxf))
        {
            auto conv1x1_output = std::make_shared<reorder>("_conv1x1_reorder_back_" + conv_node.id(), conv_node.id(), input_layout.format, input_layout.data_type);
            auto& back_node = p.get_or_create(conv1x1_output);
            p.get_processing_order().insert_next(&conv_node, &back_node);
            conv_node.invalidate_users();
            p.replace_all_usages(conv_node, back_node);
            p.add_connection(conv_node, back_node);
        }

        if (new_input)
        {
            // Splice the chosen reorder between the input and the convolution.
            auto& r_node = p.get_or_create(new_input);
            p.add_intermediate(r_node, conv_node, 0, r_node.get_dependencies().empty());
            conv_node.recalc_output_layout();
        }
    };

    // Handles one detection_output node: every input is forced (via the layout
    // optimizer) towards f32/bfyx by inserting a reorder where needed.
    const auto reorder_input_detection_output = [&p, &lo](typed_program_node<detection_output>& detection_output_node)
    {
        auto detection_output_prim = detection_output_node.get_primitive();

        for (size_t i = 0; i < detection_output_node.get_dependencies().size(); i++)
        {
            auto& input = detection_output_node.get_dependency(i);
            std::shared_ptr<reorder> new_input = lo.get_reorder(
                input.get_output_layout(),
                input.id(),
                layout_optimizer::data_type::input,
                detection_output_node,
                layout{ data_types::f32, format::bfyx, tensor{} }).first;

            if (new_input)
            {
                p.add_intermediate(new_input, detection_output_node, i);
            }
        }
    };

    // Stage 2: dispatch each node to the matching handler above.
    for (auto& prim : p.get_processing_order())
    {
        //there's an assumption that only convolution will take data/input_layout as input
        //exception to that rule would be a convolution which takes a reorder as input - see reoder_input above
        program_helpers::do_for_types<convolution, detection_output>(*prim,
            reorder_input,                  //case for convolution
            reorder_input_detection_output  //case for detection-output
            );
    }
}