2 // Copyright (c) 2018 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
17 ///////////////////////////////////////////////////////////////////////////////////////////////////
20 #include "api/CPP/proposal.hpp"
21 #include "api/CPP/roi_pooling.hpp"
22 #include "api/CPP/reorg_yolo.hpp"
23 #include "api/CPP/eltwise.hpp"
24 #include "upsampling_inst.h"
25 #include "pass_manager.h"
26 #include "program_node.h"
27 #include "layout_optimizer.h"
28 #include "program_impl.h"
29 #include "program_helpers.h"
31 using namespace cldnn;
33 // TODO: remove the friendship relation between this pass and program_impl
35 reorder_inputs::reorder_inputs(layout_optimizer& lo_ref) : base_pass("reorder_inputs"), _lo(lo_ref) {}
37 void reorder_inputs::run(program_impl& p) {
41 void reorder_inputs::run(program_impl& p, layout_optimizer& lo)
43 //first pass to set layout optimization_attributes for topology
44 for (auto& node : p.get_processing_order())
47 if (prim.type() == cldnn::convolution::type_id())
49 if (prim.as<convolution>().get_primitive()->split() > 1)
50 lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::splitted_convolution, 1);
53 //list of layers that do not support yxfb or perform worse than bfyx
54 if (prim.type() == cldnn::detection_output::type_id() || prim.type() == cldnn::proposal::type_id() ||
55 prim.type() == cldnn::roi_pooling::type_id() || prim.type() == cldnn::deconvolution::type_id() ||
56 prim.type() == cldnn::upsampling::type_id() || prim.type() == cldnn::reorg_yolo::type_id())
57 lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::bfyx_only_layer, 1);
60 const auto reorder_input = [&p, &lo](typed_program_node<convolution>& conv_node)
62 auto conv_prim = conv_node.get_primitive();
63 auto& input_node = conv_node.get_dependency(0);
64 auto&& weights_layout = conv_node.weights(0).get_output_layout();
65 auto&& input_layout = input_node.get_output_layout();
67 std::shared_ptr<reorder> new_input = nullptr;
69 if (input_node.type() == reorder::type_id()) //convolution's input is a reorder
71 auto reorder_prim = input_node.as<reorder>().typed_desc();
72 auto& reorder_input = input_node.get_dependency(0);
73 auto reorder_layout = input_node.get_output_layout();
74 reorder_layout.data_type = *reorder_prim->output_data_type;
75 new_input = lo.get_reorder(
78 layout_optimizer::data_type::input,
80 weights_layout).first;
82 auto reorder_removed = false;
83 if (new_input && new_input->output_format != format::winograd_2x3_s1_data && new_input->output_format != format::bf8_xy16 && new_input->output_format != format::byxf) //output format is not optimal
85 auto reorder_input_layout = reorder_input.get_output_layout();
87 auto opt_layout = layout(*new_input->output_data_type, new_input->output_format, reorder_input_layout.size);
88 if (reorder_input_layout == opt_layout) //reorder 'breaks' optimal format
90 if (reorder_prim->subtract_per_feature.empty() &&
91 reorder_prim->mean.empty() &&
92 !reorder_prim->output_padding) //just plain reorder
94 conv_node.replace_dependency(0, reorder_input);
95 if (input_node.get_users().size() == 0 && !input_node.is_output())
97 reorder_removed = p.extract_and_remove(input_node);
101 else //change reorder's output layout
103 reorder_prim->output_format = opt_layout.format;
104 reorder_prim->output_data_type = opt_layout.data_type;
108 else //current reorder gives bad output, simply change it
110 reorder_prim->output_format = opt_layout.format;
111 reorder_prim->output_data_type = opt_layout.data_type;
116 if (!reorder_removed)
117 input_node.recalc_output_layout();
119 conv_node.recalc_output_layout();
123 new_input = lo.get_reorder(
124 input_node.get_output_layout(),
126 layout_optimizer::data_type::input,
128 weights_layout).first;
131 if (new_input && new_input->output_format == format::winograd_2x3_s1_data)
133 auto lower_size = (conv_prim->input_offset.negate() + input_layout.size);
135 tensor upper_input_padding = tensor{ 0 };
136 upper_input_padding.spatial[0] = (2 - (lower_size.spatial[0] % 2)) % 2; //winograd conv requires input's x to be in form 4 + 2n, with restriction that x >= 3, we can shortage it to x % 2 == 0
137 upper_input_padding.spatial[1] = (8 - ((lower_size.spatial[1] - 2) % 8)) % 8; //for y, y - 2 % 8 == 0 must hold
139 p.apply_needed_padding(conv_node, input_node, padding{ conv_prim->input_offset.negate().sizes(), upper_input_padding.sizes() });
141 auto winograd_output = std::make_shared<reorder>("_winograd_" + conv_node.id(), conv_node.id(), input_layout.format,
142 input_layout.data_type, std::vector<float>{}, cldnn_reorder_mean_mode::mean_subtract, conv_node.output_layout.data_padding);
143 conv_node.output_layout.data_padding = padding{};
144 program_node& back_node = p.get_or_create(winograd_output);
145 p.get_processing_order().insert_next(&conv_node, &back_node);
147 auto bias_term = conv_node.bias_term();
148 //create additional eltwise node after reorder to compute bias
151 auto& bias_node = conv_node.get_dependency(2);
152 std::vector<primitive_id> inputs = { back_node.id(), bias_node.id() };
153 auto winograd_output_biases = std::make_shared<eltwise>(back_node.id() + "_bias", inputs,
154 cldnn::eltwise_mode::sum, conv_prim->with_activation, conv_prim->activation_negative_slope,
155 back_node.get_output_layout().data_padding);
156 back_node.get_output_layout().data_padding = padding{};
157 auto& back_bias_node = p.get_or_create(winograd_output_biases);
158 p.get_processing_order().insert_next(&back_node, &back_bias_node);
159 p.replace_all_usages(back_node, back_bias_node);
160 p.add_connection(back_node, back_bias_node);
161 p.add_connection(bias_node, back_bias_node);
162 conv_node.invalidate_users();
163 p.replace_all_usages(conv_node, back_bias_node);
166 if (conv_prim->with_activation)
168 conv_node.typed_desc()->with_activation = false;
170 back_node.set_fused_activation(activation_relu_negative_slope, cldnn_activation_additional_params_t{ conv_prim->activation_negative_slope });
175 conv_node.invalidate_users();
176 p.replace_all_usages(conv_node, back_node);
178 p.add_connection(conv_node, back_node);
180 auto& r_node = p.get_or_create(new_input);
181 r_node.as<reorder>().set_input_offset(conv_prim->input_offset);
185 p.swap_names(conv_node, back_node);
186 if (conv_node.is_output())
188 conv_node.set_output(false);
189 back_node.set_output(true);
190 for (auto& output : p.get_outputs())
192 if (output == &conv_node)
202 conv_node.remove_dependency(2);
203 auto& back_bias_node = *(p.nodes_map.find(back_node.id() + "_bias")->second);
204 p.swap_names(conv_node, back_bias_node);
205 if (conv_node.is_output())
207 conv_node.set_output(false);
208 back_bias_node.set_output(true);
209 for (auto& output : p.get_outputs())
211 if (output == &conv_node)
213 output = &back_bias_node;
221 if (new_input && (new_input->output_format == format::bf8_xy16 || new_input->output_format == format::byxf))
223 auto conv1x1_output = std::make_shared<reorder>("_conv1x1_reorder_back_" + conv_node.id(), conv_node.id(), input_layout.format, input_layout.data_type);
224 auto& back_node = p.get_or_create(conv1x1_output);
225 p.get_processing_order().insert_next(&conv_node, &back_node);
226 conv_node.invalidate_users();
227 p.replace_all_usages(conv_node, back_node);
228 p.add_connection(conv_node, back_node);
233 auto& r_node = p.get_or_create(new_input);
234 p.add_intermediate(r_node, conv_node, 0, r_node.get_dependencies().empty());
235 conv_node.recalc_output_layout();
239 const auto reorder_input_detection_output = [&p, &lo](typed_program_node<detection_output>& detection_output_node)
241 auto detection_output_prim = detection_output_node.get_primitive();
243 for (size_t i = 0; i < detection_output_node.get_dependencies().size(); i++)
245 auto& input = detection_output_node.get_dependency(i);
246 std::shared_ptr<reorder> new_input = lo.get_reorder(
247 input.get_output_layout(),
249 layout_optimizer::data_type::input,
250 detection_output_node,
251 layout{ data_types::f32, format::bfyx, tensor{} }).first;
255 p.add_intermediate(new_input, detection_output_node, i);
260 for (auto& prim : p.get_processing_order())
262 //there's an assumption that only convolution will take data/input_layout as input
263 //exception to that rule would be a convolution which takes a reorder as input - see reoder_input above
264 program_helpers::do_for_types<convolution, detection_output>(*prim,
265 reorder_input, //case for convolution
266 reorder_input_detection_output //case for detection-output