2 // Copyright (c) 2016 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
17 ///////////////////////////////////////////////////////////////////////////////////////////////////
19 #include "program_impl.h"
20 #include "primitive_inst.h"
21 #include "layout_optimizer.h"
22 #include "constants_propagator.h"
24 #include "primitive_type.h"
25 #include "api/CPP/activation.hpp"
26 #include "api/CPP/eltwise.hpp"
27 #include "api/CPP/input_layout.hpp"
28 #include "api/CPP/pooling.hpp"
29 #include "api/CPP/proposal.hpp"
30 #include "api/CPP/roi_pooling.hpp"
31 #include "api/CPP/reorg_yolo.hpp"
33 #include "activation_inst.h"
34 #include "batch_norm_inst.h"
35 #include "internal_primitive.h"
36 #include "internal_primitive_type_base.h"
37 #include "convolution_inst.h"
38 #include "concatenation_inst.h"
39 #include "crop_inst.h"
40 #include "data_inst.h"
41 #include "mutable_data_inst.h"
42 #include "deconvolution_inst.h"
43 #include "detection_output_inst.h"
45 #include "normalize_inst.h"
46 #include "permute_inst.h"
47 #include "prior_box_inst.h"
48 #include "reorder_inst.h"
49 #include "reshape_inst.h"
50 #include "scale_inst.h"
51 #include "embed_inst.h"
52 #include "softmax_inst.h"
53 #include "split_inst.h"
54 #include "program_dump_graph.h"
55 #include "upsampling_inst.h"
56 #include "eltwise_inst.h"
57 #include "fully_connected_inst.h"
59 #include "lstm_inst.h"
60 #include "lstm_gemm_inst.h"
61 #include "lstm_elt_inst.h"
62 #include "embed_inst.h"
64 #include "network_impl.h"
65 #include "kernel_selector_helper.h"
66 #include "sliding_window_utils.h"
67 #include "error_handler.h"
79 //helper function for selecting function basing on the type of the given primitive
80 //this is the termination case for parameter pack recurrence, see overload below for logic
82 void do_for_types(program_node&)
87 //helper function for selecting function basing on the type of the given primitive
88 //this function should be explicitly given set of types and implicitly set of functions.
89 //both sets should have equal size. First function will be called if type of the given primitive
90 //will match first explicitly given type, second will be called if it matches second explicitly given
92 //Functions given as arguments should themselves take std::shared_ptr<const T> as argument
93 //where T is the type that should be match if this function should be called
100 // [](typed_program_node<convolution>&){ do something if 'primitive' is a convolution },
101 // [](typed_program_node<pooling>&) { do something if 'primitive' is a pooling }
103 template <class T, class... RestOfT, class Func, class... RestOfFuncs>
104 decltype(static_cast<void>(std::declval<Func>()(std::declval<typed_program_node<T>&>()))) do_for_types(
107 RestOfFuncs const&... rest)
109 if (node.type() == T::type_id())
112 do_for_types<RestOfT...>(node, rest...);
116 struct single_element_container
118 single_element_container(T& t) : elem(&t)
121 constexpr size_t size() const { return 1; }
122 single_element_container<T> begin() const { return single_element_container(elem); }
123 single_element_container<T> end() const { return single_element_container(nullptr); }
124 single_element_container<T>& operator ++() { elem = nullptr; return *this; }
125 bool operator !=(single_element_container const& sec) { return elem != sec.elem; }
127 T operator *() { return *elem; }
130 single_element_container(T* t) : elem(t)
136 //helper function which creates single-element array if it's given anything
137 //other than std::vector.
138 //It should be used in generic code when there's a need to force vector usage
139 //in foreach loop over variable which can in one context be a vector or a scalar
143 // for (auto& string : wrap_if_single(t.dump()))
144 //depending on type T, t.dump() may return either std::string or std::vector<std::string>,
145 //to ensure compatibility between these cases, wrap_if_single will create single-element
146 //container in case t.dump() would return plain std::string.
148 // T& case -> returns container which holds T&
150 single_element_container<T> wrap_if_single(T& t)
152 return single_element_container<T>(t);
155 //helper function which creates single-element array if it's given anything
156 //other than std::vector.
157 // T const& case -> returns container which holds T const&
159 single_element_container<T const> wrap_if_single(T const& t)
161 return single_element_container<T const>(t);
164 //helper function which creates single-element array if it's given anything
165 //other than std::vector.
166 // T&& case -> returns container which holds new instance of T created by moving given param
168 single_element_container<T> wrap_if_single(T&& t)
170 static_assert(meta::always_false<T>::value, "Wrapping temporary object into single_element_container is an error (requires valid reference)");
171 return single_element_container<T>(t);
174 //helper function which creates single-element array if it's given anything
175 //other than std::vector.
176 // std::vector case -> does not wrap, returns t as-is
177 primitive::fixed_size_vector_ref const& wrap_if_single(primitive::fixed_size_vector_ref const& t)
182 //helper function for merging the weights/biases buffers on cpu side for depthwise separable convolution optimization
183 void merge_buffers(engine_impl::ptr engine, program_node &node, layout target_layout, size_t begin_offset, size_t end_offset)
185 memory_impl::ptr data_to_allocate = engine->allocate_memory(target_layout);
187 for (size_t i = begin_offset; i < end_offset; i++)
189 auto& weights = node.get_dependency(i).as<data>();
190 mem_lock<char> src{ weights.get_attached_memory() };
191 mem_lock<char> dst{ data_to_allocate };
192 std::copy(src.begin(), src.end(), dst.begin() + (i - begin_offset)*src.size());
195 for (size_t i = 0; i < end_offset - begin_offset - 1; i++)
196 node.remove_dependency(begin_offset + 1);
198 auto& data_node = node.get_dependency(begin_offset).as<data>();
199 data_node.attach_memory(*data_to_allocate, false);
202 //helper function for getting target layout used in depthwise sep optimization
203 layout get_weights_layout(typed_program_node<cldnn::data> &data_node, int32_t split)
205 auto mem_layout = data_node.get_output_layout();
207 return layout(mem_layout.data_type, mem_layout.format, { split * mem_layout.size.batch[0], mem_layout.size.feature[0], mem_layout.size.spatial[0], mem_layout.size.spatial[1] });
210 // pair.first tells whether l1 and l2 are absolutely identical
211 // pair.second tells whether l1 and l2 can be reinterpreted to each other without need of reordering
212 // note: layouts can only be considered identical if data size described by both layouts match (so no data are genereted nor dropped)
213 // note: if layouts describe two buffers with different size, consider them not to be identical even if smaller buffer can be considered to hold subsequence of larger buffer,
214 // this behavior is required to force buffer allocation for smaller buffer which, currently, should always be performed
215 std::pair<bool, bool> are_layouts_identical(layout const& l1, layout const& l2)
218 return{ true, true };
219 if (l1.data_type != l2.data_type)
220 return{ false, false };
221 if (l1.size != l2.size)
222 return{ false, false };
223 if (l1.get_linear_size() != l2.get_linear_size())
224 return{ false, false };
225 if ((l1.format == format::bf8_xy16 && l2.format != format::bf8_xy16) ||
226 (l2.format == format::bf8_xy16 && l1.format != format::bf8_xy16))
227 return{ false, false };
229 auto l1_pitch = l1.get_pitches();
230 auto l2_pitch = l2.get_pitches();
232 //ignore pitches which will never be used (for dims with size == 1)
233 for (size_t i = 0; i < CLDNN_TENSOR_DIM_MAX; ++i)
234 if (l1.size.raw[i] == 1)
236 for (size_t i = 0; i < CLDNN_TENSOR_DIM_MAX; ++i)
237 if (l2.size.raw[i] == 1)
240 auto l1_offset = l1.get_linear_offset();
241 auto l2_offset = l2.get_linear_offset();
242 if (l1_pitch == l2_pitch && l1_offset == l2_offset)
243 return{ false, true };
245 return{ false, false };
249 program_impl::program_impl(engine_impl& engine_ref, topology_impl const& topology, build_options const& options, bool is_internal)
250 : engine(&engine_ref), options(options), output_size_handling_enabled(true)
252 static std::atomic<uint32_t> id_gen{ 0 };
254 assert(prog_id != 0);
256 if ((options.get<build_option_type::tuning_config>()->config.mode == tuning_mode::tuning_tune_and_cache) &&
257 !engine->configuration().enable_profiling)
259 throw std::invalid_argument("Engine must be created with profiling enabled in tune_and_cache mode!");
262 init_graph(topology);
263 pre_optimize_graph();
265 post_optimize_graph();
267 engine->compile_program(*this);
268 this->dump_program("13_finished", true);
270 //Makes serialization with given name.
271 //Placeholder, not working yet, in progress.
272 auto serialization_network_name = get_serialization_network_name(options);
273 if (!serialization_network_name.empty() && !is_internal)
275 this->serialize(serialization_network_name);
281 // TODO: Remove once we will get full support for input/output padding in all primitive implementations.
282 void program_impl::analyze_output_size_handling_need()
284 bool handling_needed = false;
286 // Calculate output size and compare with specified.
287 for (const auto& node : processing_order)
289 if (node->is_type<convolution>())
291 auto& prim_node = node->as<convolution>();
292 const auto& prim = prim_node.get_primitive();
294 if (!prim->with_output_size)
297 tensor specified_output_range({ 0, 0, prim->output_size.spatial[0], prim->output_size.spatial[1] }, 1);
299 auto filter_size = prim_node.weights(0).get_output_layout().size;
301 auto calc_output_range = calc_sliding_window_output_range<swor_mode::all>(
302 prim_node.input().get_output_layout().size,
303 filter_size, prim->input_offset, prim->stride, prim->dilation, true, 1);
305 if (specified_output_range != calc_output_range)
306 handling_needed = true;
308 else if (node->is_type<deconvolution>())
310 auto& prim_node = node->as<deconvolution>();
311 const auto& prim = prim_node.get_primitive();
313 if (!prim->with_output_size)
316 tensor specified_output_range({ 0, 0, prim->output_size.spatial[0], prim->output_size.spatial[1] }, 1);
318 auto filter_size = prim_node.weights(0).get_output_layout().size;
320 auto calc_output_range = calc_sliding_window_needed_input_range(
321 prim_node.input().get_output_layout().size,
322 filter_size, prim->input_offset, prim->stride, { 1, 1, 1, 1 }, true, 1);
324 if (specified_output_range != calc_output_range)
325 handling_needed = true;
327 else if (node->is_type<pooling>())
329 auto& prim_node = node->as<pooling>();
330 const auto& prim = prim_node.get_primitive();
332 if (!prim->with_output_size)
335 tensor specified_output_range({ 0, 0, prim->output_size.spatial[0], prim->output_size.spatial[1] }, 1);
337 // TODO: Check compatibility of output size calculation (with caffe).
338 auto calc_output_range = calc_sliding_window_output_range<swor_mode::exceed_once_data>(
339 prim_node.input().get_output_layout().size,
340 prim->size, prim->input_offset, prim->stride, { 1, 1, 1, 1 }, true, 1);
342 if (specified_output_range != calc_output_range)
343 handling_needed = true;
347 output_size_handling_enabled = handling_needed;
350 std::list<std::shared_ptr<program_node>> program_impl::get_nodes() const
352 std::list<std::shared_ptr<program_node>> ret;
354 for (auto& node : processing_order)
355 ret.push_back(nodes_map.at(node->id()));
359 void program_impl::init_graph(topology_impl const& topology)
361 auto const& topo_map = topology.get_primitives();
362 for (auto const& prim : topo_map)
364 auto& n = get_or_create(prim.second);
365 inputs.push_back(&n);
369 for (auto itr = inputs.begin(); itr != inputs.end(); )
371 auto node_itr = itr++;
372 auto& node = (*node_itr);
373 auto deps = node->get_primitive()->dependencies();
377 //add pointers to node's dependencies
378 for (auto& dep : deps)
381 auto dep_node = nodes_map.at(dep);
382 node->dependencies.push_back(dep_node.get());
383 dep_node->users.push_back(node);
386 throw std::runtime_error("Program doesn't contain primitive: " + dep +
387 " that is input to: " + node->get_primitive()->id);
391 //primitive has dependencies so remove it from 'inputs'
392 inputs.erase(node_itr);
395 replace_nodes_post();
398 calc_processing_order();
400 dump_program("0_init", true);
402 calc_prior_boxes(); dump_program("1_calculated_prior_boxes", true);
405 dump_program("2_analyzed_graph", true);
408 void program_impl::pre_optimize_graph()
410 trim_to_outputs(); dump_program("3_trimmed", true);
411 calculate_BFS_processing_order();
412 analyze_output_size_handling_need();
413 for (auto& node : processing_order)
415 if (!node->is_type<internal_primitive>() && !node->is_type<data>())
416 node->get_output_layout();
419 if (options.get<build_option_type::optimize_data>()->enabled())
421 prepare_primitive_fusing();
422 layout_optimizer lo(output_size_handling_enabled);
424 // this code should move to post compilation after kernel selector will support handling reorder bias
425 pre_optimize_bias(lo);
426 dump_program("4_reordered_inputs", true);
430 remove_redundant_reorders(); dump_program("5_removed_redundant_reorders", true);
432 prepare_depthwise_sep_opt();
434 propagate_constants(); dump_program("6_propagated_constants", true);
436 //try to fuse buffers (i.e. depth_concat in bfyx format) after padding calculations
437 if (options.get<build_option_type::optimize_data>()->enabled())
439 prepare_buffer_fusing();
442 dump_program("7_pre_optimized", true);
445 void program_impl::compile_graph()
447 for (auto& node : processing_order)
449 if (!node->is_type<internal_primitive>() && !node->is_type<data>())
451 node->get_output_layout();
452 if (!node->is_type<data>() && !(node->is_type<mutable_data>() && node->get_dependencies().empty()))
453 node->selected_impl = node->type()->choose_impl(*engine, *node);
457 dump_program("8_compiled", true);
460 void program_impl::post_optimize_graph()
463 post_optimize_weights(lo); dump_program("9_reordered_weights", true);
464 remove_redundant_reorders(); dump_program("10_removed_redundant_reorders", true); //TODO: do we need it at this place also?
465 propagate_constants(); dump_program("11_propagated_constants", true);
466 prep_opt_depthwise_sep_post();
467 update_processing_numbers(); dump_program("12_validated_processing_order", true);
468 prepare_memory_dependencies();
471 void program_impl::cleanup()
473 for (auto& node : processing_order)
474 if (!node->is_type<internal_primitive>())
475 node->get_output_layout();
477 //in debug build, at the end, mark all nodes as outputs so user can query for buffers of all not-optimized nodes, including internal ones etc.
478 if (is_debug_build())
480 for (auto& node : processing_order)
482 if (!node->is_output())
484 node->set_output(true);
485 outputs.push_back(node);
491 std::string get_id_string(size_t i) {
492 std::stringstream ss;
493 ss << std::setw(5) << std::setfill('0') << i;
497 void program_impl::replace_nodes_pre()
499 auto itr = nodes_map.begin();
500 while (itr != nodes_map.end())
502 auto node_itr = itr++;
503 auto& node = (*node_itr).second;
505 //find split primitives and create crop primitives out of them
506 if (node->is_type<split>())
508 auto split_prim = node->as<split>().typed_desc();
509 primitive_id input_id = split_prim->input[0];
510 auto split_num = split_prim->output_offsets.size();
512 //create crop for each split ouptut provided
513 for (decltype(split_num) i = 0; i < split_num; i++)
515 primitive_id output_id = node->id() + ":" + split_prim->output_ids[i];
517 //create dummy crop primitive and add it to nodes map
518 auto crop_prim = std::make_shared<crop>(output_id, input_id, tensor{ 1,1,1,1 }, split_prim->output_offsets[i]);
519 get_or_create(crop_prim);
526 void program_impl::replace_nodes_post()
528 auto itr = nodes_map.begin(); //note we need to use iterators since currently processed element can be removed
529 while (itr != nodes_map.end())
531 auto node_itr = itr++;
532 auto& node = (*node_itr).second;
534 //find split primitives and create crop primitives out of them
535 if (node->is_type<split>())
537 //check if split is not used by any primitive, as it will be optimized
538 if (node->get_users().size() != 0)
539 throw std::logic_error("Split layer cannot be used directly! Please use split output \"" + node->id() + ":<split_output_id>\"!");
541 //get_output size and validate split primitive inputs
542 auto output_layout = node->get_output_layout();
543 auto output_layout_size = output_layout.size;
545 auto split_prim = node->as<split>().typed_desc();
546 primitive_id input_id = split_prim->input[0];
547 auto split_num = split_prim->output_offsets.size();
549 //create crop for each split ouptut provided
550 for (decltype(split_num) i = 0; i < split_num; i++)
552 primitive_id output_id = node->id() + ":" + split_prim->output_ids[i];
554 auto node_ptr = nodes_map.find(output_id)->second;
556 //calculate crop reference input size
557 tensor reference_input_size;
559 for (decltype(split_num) j = 0; j < i; j++)
560 reference_input_size += split_prim->output_offsets[j + 1] - split_prim->output_offsets[j];
562 for (decltype(split_num) j = i; j < split_num - 1; j++)
563 reference_input_size += split_prim->output_offsets[j + 1] - split_prim->output_offsets[j];
565 reference_input_size = output_layout_size - reference_input_size;
567 //update crop primitive and add connections
568 node_ptr->set_output_padding(output_layout.data_padding);
569 auto crop_prim = node_ptr->as<crop>().typed_desc();
570 crop_prim->reference_input = reference_input_size;
572 add_connection(node->get_dependency(0), *node_ptr);
575 //remove input->split connection and remove original split node
576 remove_connection(node->get_dependency(0), *node);
577 optimized_out.push_back(node->id());
578 nodes_map.erase(node->id());
582 //find upsampling primitives with bilinear filtering and create deconvolution with proper weights instead
583 if (node->is_type<upsampling>())
585 auto upsampling_prim = node->as<upsampling>().typed_desc();
587 if (upsampling_prim->sample_type != upsampling_sample_type::bilinear)
590 //check if num_filter is not 0 (required for bilinear upsampling)
591 if (upsampling_prim->num_filter == 0)
592 throw std::logic_error("num_filter in upsampling cannot be 0 in bilinear filtering mode in \"" + node->id() + "\"!");
594 primitive_id upsampling_id = node->id();
595 auto& input_node = node->get_dependency(0);
597 primitive_id input_id = upsampling_prim->input[0];
598 auto num_filter = upsampling_prim->num_filter;
600 //setting deconvolution parameters based on upsampling input
601 auto scale = static_cast<tensor::value_type>(upsampling_prim->scale);
602 tensor stride(1, 1, scale, scale);
603 auto offset = static_cast<tensor::value_type>(std::ceil((scale - 1) / 2.f));
604 tensor input_offset(0, 0, -offset, -offset);
606 //setting weights for deconvolution
607 auto kernel_size = static_cast<tensor::value_type>((2 * scale) - (scale % 2));
608 layout weights_layout(data_types::f32, format::bfyx, tensor(1, 1, kernel_size, kernel_size));
610 std::vector<primitive_id> weights_vec;
611 for (uint32_t weights_idx = 0; weights_idx < num_filter; weights_idx++)
613 memory_impl::ptr data_to_allocate = engine->allocate_memory(weights_layout);
614 mem_lock<float> dst{ data_to_allocate };
615 float *dst_data = dst.data();
616 //initialize with bilinear weights data
617 auto f = static_cast<uint32_t>(std::ceil(kernel_size / 2.0f));
618 float c = (2 * f - 1 - f % 2) / (2.f * f);
621 for (size_t i = 0; i < weights_layout.count(); ++i) {
622 x = static_cast<float>(i % kernel_size);
623 y = static_cast<float>((i / kernel_size) % kernel_size);
624 dst_data[i] = (1 - std::abs(x / f - c)) * (1 - std::abs(y / f - c));
627 //create weights primitive, with dummy memory which will be replaced in firther step
628 primitive_id weights_id = upsampling_id + "_deconvolution_weights" + std::to_string(weights_idx);
629 layout dummy_layout(data_types::f32, format::bfyx, tensor(1, 1, 1, 1));
631 auto weights_prim = std::make_shared<data>(weights_id, memory::attach(dummy_layout, &zero, 1));
632 get_or_create(weights_prim);
634 weights_vec.push_back(weights_id);
636 auto weights_node_ptr = nodes_map.find(weights_id)->second;
638 //attach weights buffer
639 auto& data_node = weights_node_ptr->as<data>();
640 data_node.attach_memory(*data_to_allocate, false);
643 //remove upsampling node, rename it and move to the optimized list
644 remove_connection(node->get_dependency(0), *node);
645 auto rename_id = upsampling_id + "_tmp";
646 rename(*node, rename_id);
648 //create deconvolution primitive
649 auto deconv_prim = std::make_shared<deconvolution>(upsampling_id, input_id, weights_vec, stride, input_offset);
650 get_or_create(deconv_prim);
652 auto deconv_node_ptr = nodes_map.find(upsampling_id)->second;
654 auto upsampling_node_ptr = nodes_map.find(rename_id)->second;
655 replace_all_usages(*upsampling_node_ptr, *deconv_node_ptr);
656 optimized_out.push_back(rename_id);
657 nodes_map.erase(rename_id);
659 //add connections input->deconvolution and weights->deconvolution
660 add_connection(input_node, *deconv_node_ptr);
662 for (uint32_t weights_idx = 0; weights_idx < num_filter; weights_idx++)
664 auto weights_node_ptr = nodes_map.find(weights_vec[weights_idx])->second;
665 add_connection(*weights_node_ptr, *deconv_node_ptr);
670 //find deconvolution primitives with stride 1 and change them to convolution with trasposed weights
671 if (node->is_type<deconvolution>())
673 if (!options.get<build_option_type::optimize_data>()->enabled())
676 auto deconv_prim = node->as<deconvolution>().typed_desc();
678 //limit optimization to stride = 1
679 if (deconv_prim->stride.spatial[0] != 1 || deconv_prim->stride.spatial[1] != 1 || deconv_prim->gradient())
682 primitive_id deconv_id = node->id();
683 auto& input_node = node->get_dependency(0);
685 primitive_id input_id = deconv_prim->input[0];
687 //setting convolution parameters based on deconvolution params
688 auto stride = deconv_prim->stride;
689 auto weights = deconv_prim->weights;
690 std::vector<primitive_id> weights_vec;
691 for (auto& weights_id : weights)
692 weights_vec.push_back(weights_id);
693 auto biases = deconv_prim->bias;
694 std::vector<primitive_id> bias_vec;
695 for (auto& bias_id : biases)
696 bias_vec.push_back(bias_id);
697 auto input_offset = deconv_prim->input_offset;
698 auto with_activation = deconv_prim->with_activation;
699 auto activation_negative_slope = deconv_prim->activation_negative_slope;
700 auto output_padding = deconv_prim->output_padding;
702 //remove deconvolution node and its connections to weights and biases, rename it and move to the optimized list
703 tensor filter_size = { 1, 1, 1, 1 };
704 remove_connection(node->get_dependency(0), *node);
705 for (auto& weights_id : weights_vec)
707 auto weights_node_ptr = nodes_map.find(weights_id)->second;
708 remove_connection(*weights_node_ptr, *node);
709 //get filter spatial sizes for input offset adjustment, perform this only once as all filters shouls have same size
710 if (weights_id == weights_vec[0])
711 filter_size = weights_node_ptr->get_output_layout().size;
714 input_offset.spatial[0] = std::abs(input_offset.spatial[0]) - (filter_size.spatial[0] - 1);
715 input_offset.spatial[1] = std::abs(input_offset.spatial[1]) - (filter_size.spatial[1] - 1);
717 if (!bias_vec.empty())
719 for (auto& bias_id : bias_vec)
721 auto bias_id_node_ptr = nodes_map.find(bias_id)->second;
722 remove_connection(*bias_id_node_ptr, *node);
725 auto rename_id = deconv_id + "_tmp";
726 rename(*node, rename_id);
728 //create convolution primitive
729 if (biases.size() != 0)
731 auto conv_prim = std::make_shared<convolution>(deconv_id, input_id, weights_vec, bias_vec,
732 stride, input_offset, tensor{ 1, 1, 1, 1 }, with_activation, activation_negative_slope, output_padding);
733 get_or_create(conv_prim);
737 auto conv_prim = std::make_shared<convolution>(deconv_id, input_id, weights_vec,
738 stride, input_offset, tensor{ 1, 1, 1, 1 }, with_activation, activation_negative_slope, output_padding);
739 get_or_create(conv_prim);
742 auto conv_node_ptr = nodes_map.find(deconv_id)->second;
743 auto conv_node = &conv_node_ptr->as<convolution>();
744 conv_node->set_transposed(true);
746 //add connections input->convolution, weights->convolution and bias->convolution
747 add_connection(input_node, *conv_node_ptr);
749 for (auto& weights_id : weights_vec)
751 auto weights_node_ptr = nodes_map.find(weights_id)->second;
752 add_connection(*weights_node_ptr, *conv_node_ptr);
755 if (!bias_vec.empty())
757 for (auto& bias_id : bias_vec)
759 auto bias_id_node_ptr = nodes_map.find(bias_id)->second;
760 add_connection(*bias_id_node_ptr, *conv_node_ptr);
764 auto deconv_node_ptr = nodes_map.find(rename_id)->second;
765 replace_all_usages(*deconv_node_ptr, *conv_node_ptr);
766 optimized_out.push_back(rename_id);
767 nodes_map.erase(rename_id);
774 void program_impl::handle_lstm()
776 bool has_lstm_children;
777 auto itr = nodes_map.begin(); //note we need to use iterators since currently processed element can be removed
778 while (itr != nodes_map.end())
780 auto node_itr = itr++;
781 auto& node = (*node_itr).second;
782 has_lstm_children = false;
783 // replace lstm node with lstm_gemm and lstm_elt nodes
784 if (node->is_type<lstm>()) {
785 bool initial_hidden_term = node->as<lstm>().initial_hidden_term();
786 bool initial_cell_term = node->as<lstm>().initial_cell_term();
787 bool bias_term = node->as<lstm>().bias_term();
788 auto lstm_prim = node->as<lstm>().typed_desc();
789 primitive_id weights_id = lstm_prim->weights;
790 primitive_id recurrent_id = lstm_prim->recurrent;
791 primitive_id bias_id = bias_term ? lstm_prim->bias : "";
792 primitive_id initial_hidden_id = initial_hidden_term ? lstm_prim->initial_hidden : "";
793 primitive_id initial_cell_id = initial_cell_term ? lstm_prim->initial_cell : "";
794 //removing connection with weights to get proper dependency order for next operations
795 remove_connection(*nodes_map.at(weights_id), *node);
796 remove_connection(*nodes_map.at(recurrent_id), *node);
798 remove_connection(*nodes_map.at(bias_id), *node);
799 if (initial_hidden_term)
800 remove_connection(*nodes_map.at(initial_hidden_id), *node);
801 if (initial_cell_term)
802 remove_connection(*nodes_map.at(initial_cell_id), *node);
805 auto input_size = node->get_dependency(0).get_output_layout().size;
806 auto recurrent_size = nodes_map.at(recurrent_id)->get_output_layout().size;
807 auto hidden_size = tensor(input_size.batch[0], 1, recurrent_size.spatial[0], input_size.feature[0]);
808 size_t directions = recurrent_size.feature[0];
809 size_t input_dependencies = node->get_dependencies().size();
810 size_t sequence_len = node->as<lstm>().sequence_len();
812 //if the sequence has a single element but it has multiple inputs then
813 //the parent of this lstm is an lstm node. If this is a bidirectional lstm
814 //then the sequence length is the number of dependencies divided by 2.
815 if (sequence_len == 1 && input_dependencies > 1)
816 sequence_len = (directions == 1) ? input_dependencies : input_dependencies / 2;
818 //check if this lstm node has an lstm child
819 for (auto& user : node->get_users())
821 if (user->is_type<lstm>())
823 has_lstm_children = true;
827 std::vector<program_node*> cell_list(directions * sequence_len);
828 std::vector<program_node*> concat_depends(directions * sequence_len);
829 std::vector<primitive_id> output_ids_offsets(directions * sequence_len);
831 primitive_id hidden_fwd_id = initial_hidden_id;
832 primitive_id hidden_bwd_id = initial_hidden_id;
833 primitive_id cell_fwd_id = initial_cell_id;
834 primitive_id cell_bwd_id = initial_cell_id;
836 auto split_direction = [&](const std::string gate, bool initial_term, primitive_id& fwd_id, primitive_id& bwd_id) {
838 primitive_id initial_id = fwd_id;
839 fwd_id = node->id() + ":" + gate + "_fwd";
840 auto fwd_node = std::make_shared<crop>(fwd_id, initial_id, hidden_size, tensor{ 0,0,0,0 });
841 auto &n1 = get_or_create(fwd_node);
842 add_connection(*nodes_map.at(initial_id), n1);
843 bwd_id = node->id() + ":" + gate + "_bwd";
844 auto bwd_node = std::make_shared<crop>(bwd_id, initial_id, hidden_size, tensor{ 0,1,0,0 });
845 auto &n2 = get_or_create(bwd_node);
846 add_connection(*nodes_map.at(initial_id), n2);
850 //if bidirectional lstm then initial_hidden and initial_cell terms need to be split
851 if (directions > 1) {
852 split_direction("hidden", initial_hidden_term, hidden_fwd_id, hidden_bwd_id);
853 split_direction("cell", initial_cell_term, cell_fwd_id, cell_bwd_id);
857 for (size_t dir = 0; dir < directions; ++dir) {
858 auto hidden_id = dir == 0 ? hidden_fwd_id : hidden_bwd_id;
859 auto cell_id = dir == 0 ? cell_fwd_id : cell_bwd_id;
860 for (size_t i = 0; i < sequence_len; ++i) {
861 size_t idx = i + dir * sequence_len;
862 primitive_id lstm_gemm_id = node->id() + ":lstm_gemm" + get_id_string(idx);
863 primitive_id lstm_elt_id = node->id() + ":lstm_elt" + get_id_string(idx);
864 primitive_id crop_id = node->id() + ":crop" + get_id_string(idx);
866 size_t input_idx = i;
867 //for bidirectional lstms, if first LSTM layer then reverse input
868 //for subsequent stacked layers the input is strided on the dir dimension
869 if (directions > 0) {
870 if (input_dependencies > sequence_len) { // stacked layer
871 input_idx = dir * sequence_len + i;
875 if (dir > 0) { // first layer
876 input_idx = sequence_len - i - 1;
880 primitive_id lstm_gemm_input_id = node->get_dependency(input_idx).get_org_primitive_id();
882 auto lstm_gemm_node = std::make_shared<lstm_gemm>(lstm_gemm_id, lstm_gemm_input_id, weights_id, recurrent_id, bias_id, hidden_id, (uint32_t)dir);
883 auto &n1 = get_or_create(lstm_gemm_node);
885 auto lstm_elt_node = std::make_shared<lstm_elt>(lstm_elt_id, lstm_gemm_id, cell_id, lstm_prim->clip, lstm_prim->input_forget,
886 lstm_prim->activations, lstm_prim->activation_params, lstm_prim->offset_order);
887 auto &n2 = get_or_create(lstm_elt_node);
888 //adding lstm_elt as user
889 add_connection(n1, n2);
890 //adding dependecy to lstm_gemm node
892 add_connection(node->get_dependency(input_idx), n1);
893 //adding weights and initial values to lstm_gemm
894 add_connection(*nodes_map.at(weights_id), n1);
895 add_connection(*nodes_map.at(recurrent_id), n1);
897 add_connection(*nodes_map.at(bias_id), n1);
899 //adding cell and hiddens as dependencies
902 add_connection(*cell_list[size_t(i - 1) * directions + dir], n2);
903 add_connection(*(concat_depends[size_t(i - 1) * directions + dir]), n1);
905 //if initial values are present
908 if (initial_hidden_term)
909 add_connection(*nodes_map.at(hidden_id), n1);
910 if (initial_cell_term)
911 add_connection(*nodes_map.at(cell_id), n2);
915 hidden_id = crop_id + ":hidden";
916 auto crop_hidden = std::make_shared<crop>(hidden_id, lstm_elt_id, hidden_size, tensor{ 0,0,0,0 });
917 auto &n3 = get_or_create(crop_hidden);
918 //adding eltwise as dependency to hidden
919 add_connection(n2, n3);
921 //if parent is lstm adding hiddens as dependency
922 if (has_lstm_children)
924 for (auto& user : node->get_users())
926 add_connection(n3, *user);
929 concat_depends[i * directions + dir] = &n3;
932 if (i < sequence_len - 1) {
933 cell_id = crop_id + ":cell";
934 auto crop_cell = std::make_shared<crop>(cell_id, lstm_elt_id, hidden_size, tensor{ 0,1,0,0 });
935 auto &n4 = get_or_create(crop_cell);
936 add_connection(n2, n4);
937 cell_list[i * directions + dir] = &n4;
939 output_ids_offsets[i * directions + dir] = hidden_id;
943 //if there is no next lstm, concatenation is created
944 if (!has_lstm_children)
946 primitive_id original_id = node->id();
947 primitive_id concatenation_id = original_id + ":concat";
948 auto concatenation_primitive = std::make_shared<concatenation>(concatenation_id, output_ids_offsets, concatenation::along_f);
949 auto &concatenation_node = get_or_create(concatenation_primitive);
950 for (auto sub_dependency : concat_depends)
952 add_connection(*sub_dependency, concatenation_node);
954 if (directions == 2) {
955 // bidirectional support requires concatenations along the direction and sequence axis
956 // instead we can concatenate along the sequence axis and reshape the tensor to the account
958 tensor output_size {input_size.batch[0], (int32_t)sequence_len, hidden_size.spatial[0], (int32_t)directions};
959 primitive_id reshape_id = original_id + ":reshape";
960 auto reshape_primitive = std::make_shared<reshape>(reshape_id, concatenation_id, output_size);
961 auto &reshape_node = get_or_create(reshape_primitive);
962 add_connection(concatenation_node, reshape_node);
963 for (auto& user : node->get_users())
965 add_connection(reshape_node, *user);
970 //removing expanded node
971 remove_all_connections(*node);
972 nodes_map.erase(node->id());
979 void program_impl::set_outputs()
981 auto outputs_option = options.get<build_option_type::outputs>();
982 if (!outputs_option->outputs.empty())
984 for (auto const& output : outputs_option->outputs)
986 auto o_node = nodes_map.at(output);
987 o_node->set_output(true);
988 outputs.push_back(o_node.get());
993 for (auto& node : nodes_map)
994 if (node.second->is_endpoint())
996 node.second->set_output(true);
997 outputs.push_back(node.second.get());
1002 void program_impl::calc_processing_order()
1004 processing_order.clear();
1006 //run dfs to sort nodes topologically
1007 for (auto input : inputs)
1009 if (input->is_marked())
1013 std::list<std::pair<program_node*, std::list<program_node*>::const_iterator>> stack = { std::make_pair(input, input->users.begin()) };
1015 while (!stack.empty()) //imitate call stack
1018 auto& frame = stack.back();
1020 while (frame.second != frame.first->users.end())
1022 auto successor = *frame.second;
1025 if (!successor->is_marked())
1030 stack.push_back(std::make_pair(successor, successor->users.begin()));
1035 //we have finished processing one node so add it to the processing queue
1036 processing_order.push_front(frame.first);
1037 frame.first->processing_itr = processing_order.begin();
1045 for (auto& node : processing_order)
1047 node->processing_num = ++idx;
1052 void program_impl::update_processing_numbers()
1055 for (auto& node : processing_order)
1057 node->processing_num = ++idx;
1060 for (auto& node : processing_order)
1062 if (!processing_order_is_correct(node))
1064 CLDNN_ERROR_MESSAGE(node->id(), "Incorrect processing order");
1070 void program_impl::calc_prior_boxes()
1072 auto itr = processing_order.begin();
1073 while (itr != processing_order.end())
1075 auto& node = (*itr++);
1076 if (!node->is_type<prior_box>())
1079 auto& pb_node = node->as<prior_box>();
1081 pb_node.calc_result();
1082 remove_connection(pb_node.input(), pb_node);
1084 auto& result = pb_node.get_result_buffer();
1085 result.add_ref(); // need to inc ref count since we will be assigning this memory as cldnn_memory in next line that is not ref_count_obj
1086 auto cpp_mem = details::memory_c_to_cpp_converter::convert(api_cast(&result));
1088 auto& data_node = get_or_create(std::make_shared<data>("_cldnn_tmp_" + pb_node.id() + "_result", cpp_mem));
1089 replace(pb_node, data_node, false, false);
1095 void program_impl::mark_constants()
1097 for (auto& node : processing_order)
1099 if (node->dependencies.empty())
1101 if (node->is_type<prior_box>())
1104 node->constant = true;
1105 for (auto& dep : node->get_dependencies())
1109 node->constant = false;
1114 if (!node->constant)
1115 for (auto& dep : node->get_dependencies())
1117 dep->constant_frontier = true;
1121 void program_impl::mark_data_flow()
1123 std::list<program_node*> stack;
1124 for (auto const& node : processing_order)
1126 if ((node->is_endpoint() && !node->constant) || node->is_type<mutable_data>())
1128 stack.push_back(node);
1129 node->data_flow = true;
1134 while (!stack.empty())
1136 auto node = stack.front();
1140 size_t inputs_count = (node->is_type<internal_primitive>() ? node->get_dependencies().size() : node->get_primitive()->input.size());
1141 //TODO: remove this hack after addition of constants propagation pass
1142 if (node->is_type<detection_output>() || node->is_type<proposal>())
1143 inputs_count = 2; //ignore third input as it is related to prior boxes (i.e. concat of prior-boxes)
1145 for (auto dep : node->get_dependencies())
1147 bool data_flow = (dep_idx < inputs_count && !dep->constant);
1152 dep->data_flow = data_flow;
1154 if (dep->is_marked())
1157 stack.push_back(dep);
1162 for (auto& node : processing_order)
1164 assert(!node->constant || !node->data_flow); //node which is constant cannot be marked as data flow
1169 void program_impl::trim_to_outputs()
1171 size_t actual_nodes = processing_order.size();
1172 if (!actual_nodes) //degenerated case but can happen
1175 if (outputs.size() == actual_nodes)
1178 //do backward bfs starting from all outputs
1179 std::list<const std::vector<program_node*>*> stack = { &outputs };
1180 while (!stack.empty())
1182 auto nodes_list = stack.front();
1185 for (auto node : *nodes_list)
1187 if (!node->is_marked())
1190 if (!node->get_dependencies().empty())
1191 stack.push_back(&node->get_dependencies());
1196 //all not-marked nodes should be removed
1197 std::list<program_node*> to_rem;
1198 for (auto node : processing_order)
1200 if (node->is_type<input_layout>()) //input layout may become disconnected during prior boxes calculations so it may have not been marked at this place but we don't want to remove it
1202 else if (!node->is_marked())
1203 to_rem.push_back(node);
1206 for (auto const& node : to_rem)
1208 if (node->is_input())
1209 inputs.remove(node);
1212 for (auto dep : node->dependencies)
1213 if (dep->is_marked())
1214 dep->users.remove(node);
1217 for (auto user : node->users)
1218 if (user->is_marked())
1219 user->dependencies.erase(std::remove(user->dependencies.begin(), user->dependencies.end(), node), user->dependencies.end());
1221 optimized_out.push_back(node->id());
1222 nodes_map.erase(node->id());
1226 void add_memory_dependency(program_node* node, program_node* dep)
1228 if (node->can_be_optimized() ||
1229 !dep->can_be_optimized())
1231 node->add_memory_dependency(dep->id());
1235 if (node->id() == dep->id())
1239 for (auto subdep : dep->get_dependencies())
1241 add_memory_dependency(node, subdep);
1242 add_memory_dependency(subdep, node);
1247 void program_impl::basic_memory_dependencies()
1249 auto itr = processing_order.begin();
1250 std::vector<primitive_id> past_outputs;
1251 while (itr != processing_order.end())
1256 //data primitive can't be reused
1257 if (node->is_type<data>())
1260 // add my dependencies to restriction list (can't share input.output buffers)
1261 for (auto it : node->get_dependencies())
1263 add_memory_dependency(node, it);
1264 add_memory_dependency(it, node);
1267 // Note we iterate over processing order, it means if primitve has processing num greater than any of outputs, this output
1268 // has to land on the primitve restriction list. Otherwise memory reuse can corrupt final results.
1269 node->add_memory_dependency(past_outputs);
1270 // if current node is an output add it to the outputs list after restriction.
1271 if (node->is_output())
1272 past_outputs.push_back(node->id());
1276 void program_impl::skipped_branch_memory_dependencies()
1278 auto itr = processing_order.begin();
1279 // Primitive A can't use primitive B buffer if B->processing_num < A->processing_num and any of B users processing_num > A->processing_num
1280 // Otherwise it could override data that has to be used in the future.
1281 // TODO: improve algorithm to to O(n*log(n))
1282 while (itr != processing_order.end())
1286 auto itr2 = processing_order.begin();
1289 while (itr2 != processing_order.end())
1291 auto& node2 = *itr2;
1293 if (node2->get_processing_num() < node->get_processing_num())
1295 // if at least one user will be processed after 'node', node2 has to be added to forbiden list
1296 for (auto usr : node2->get_users())
1298 if (usr->get_processing_num() > node->get_processing_num())
1300 add_memory_dependency(node, node2);
1301 add_memory_dependency(node2, node);
1310 void program_impl::oooq_memory_dependencies()
1312 auto itr = processing_order.begin();
1313 // This order let us build dependencies based on syncing points.
1314 // Set of nodes between two syncing points will be called sync_region.
1315 // Major rules is: can't share resource with nodes in my sync_region
1317 uint32_t last_barrier = 0;
1318 bool needs_barrier = false;
1319 std::vector<cldnn::program_node*> sync_region;
1320 while (itr != processing_order.end())
1325 // if any of dep has proccess num after barrier -> needs barrier
1326 for (auto dep : node->get_dependencies())
1328 if (dep->get_processing_num() >= last_barrier)
1330 needs_barrier = true;
1337 last_barrier = node->get_processing_num();
1338 needs_barrier = false;
1339 // add each pair bi-direction dependency
1340 for (auto nd1 = sync_region.begin(); nd1 + 1 != sync_region.end(); nd1++)
1342 for (auto nd2 = nd1 + 1; nd2 != sync_region.end(); nd2++)
1344 add_memory_dependency(*nd1, *nd2);
1345 add_memory_dependency(*nd2, *nd1);
1349 // collect dependencies of every node in sync region
1350 std::vector<cldnn::program_node*> deps;
1351 for (auto& nd_in_region : sync_region)
1352 for (auto& dep : nd_in_region->get_dependencies())
1353 deps.emplace_back(dep);
1356 for (auto& nd_in_region : sync_region)
1357 for (auto& dep : deps)
1359 add_memory_dependency(nd_in_region, dep);
1360 add_memory_dependency(dep, nd_in_region);
1363 sync_region.clear();
1365 sync_region.push_back(node);
1369 void program_impl::prepare_memory_dependencies()
1371 if (!get_engine().configuration().enable_memory_pool)
1374 basic_memory_dependencies();
1375 skipped_branch_memory_dependencies();
1376 oooq_memory_dependencies();
1379 std::string program_impl::get_memory_dependencies_string() const
1381 std::string mem_dep = "Memory dependencies/restrictions:\n";
1382 auto itr = processing_order.begin();
1383 while (itr != processing_order.end())
1387 mem_dep = mem_dep.append("primitive: ").append(node->id()).append(" restricted list: ");
1388 for (auto it : node->get_memory_dependencies())
1389 mem_dep == mem_dep.append(it).append(", ");
1390 mem_dep = mem_dep.append("\n");
1395 void program_impl::remove_redundant_reorders()
1397 auto itr = processing_order.begin(); //note we need to use iterators since currently processed element can be removed
1398 while (itr != processing_order.end())
1400 auto& node = (*itr++); //post-inc to avoid invalidation due to possible erase
1401 if (!node->is_type<reorder>()) //only care for reorders
1404 program_node* current_node = node;
1405 std::vector<program_node*> r_nodes_to_remove;
1407 auto optimize = true;
1408 while (current_node)
1410 auto& r_node = current_node->as<reorder>();
1411 current_node = nullptr;
1413 if (r_node.has_mean() || !r_node.get_primitive()->subtract_per_feature.empty() || //do not optimize if mean of subtract are present
1414 (r_node.is_output() && r_node.get_dependency(0).is_output())) //do not optimize when both reorder and layer before are outputs
1420 r_nodes_to_remove.push_back(&r_node);
1422 if (r_node.get_dependency(0).is_type<reorder>() && r_node.get_dependencies().size() == 1 && r_node.get_users().size() == 1 && r_node.get_dependency(0).get_users().size() == 1)
1423 current_node = &r_node.get_dependency(0);
1428 assert(node->dependencies.size() == 1 && "reorder without mean should have exactly one dependecy (input)");
1429 auto& r_output = r_nodes_to_remove.front();
1430 auto& r_input = r_nodes_to_remove.back()->get_dependency(0);
1431 auto o_layout = r_output->get_output_layout();
1432 auto i_layout = r_input.get_output_layout();
1434 auto ident = are_layouts_identical(o_layout, i_layout);
1438 for (auto remove_reorder_node : r_nodes_to_remove)
1440 auto& r_node = remove_reorder_node->as<reorder>();
1442 if (ident.first && ident.second && r_node.is_output() && r_node.get_dependency(0).is_input()) //do not optimize when reorder is output and layer before is input
1451 for (auto remove_reorder_node : r_nodes_to_remove)
1453 auto& r_node = remove_reorder_node->as<reorder>();
1456 r_node.can_be_optimized(true);
1457 r_node.requires_reinterpret(!ident.first);
1458 if (ident.first) //no need of reshape
1459 extract_and_remove(r_node); //try to remove if possible (with respect to r_node not being marked as output)
1465 recalculate processing_order
1466 algorithm based on: CLRS 24.5 (critical path in DAG)
1467 modifications: adjust for multiple inputs
1468 input: any topological order in processing order
1469 output: BFS topological order.
1472 void program_impl::calculate_BFS_processing_order() {
1473 std::map<program_node*, int> distances;
1474 for (auto itr : processing_order)
1476 distances[itr] = -1;
1478 int max_distance = 0;
1479 for (auto itr : processing_order)
1482 if (distances[itr] == -1) { // this must be an input
1483 distances[itr] = 0; // initialize input
1486 for (auto& user : itr->get_users())
1488 distances[user] = std::max(distances[user], distances[itr] + 1);
1489 max_distance = std::max(max_distance, distances[user]);
1493 //bucket sort nodes based on their max distance from input
1494 std::vector<std::vector<program_node*>> dist_lists;
1495 dist_lists.resize(max_distance + 1);
1496 for (auto itr : processing_order)
1498 dist_lists[distances[itr]].push_back(itr);
1501 //replace the old processing order by the new one, still topological.
1502 processing_order.clear();
1503 for (auto& dist : dist_lists)
1505 for (auto& node : dist)
1507 processing_order.push_back(node);
1508 node->processing_itr = processing_order.end();
1509 node->processing_itr--;
1512 update_processing_numbers();
1516 void program_impl::reorder_inputs(layout_optimizer& lo)
1518 //first pass to set layout optimization_attributes for topology
1519 for (auto& p : nodes_map)
1521 auto& prim = *p.second;
1522 if (prim.type() == cldnn::convolution::type_id())
1524 if (prim.as<convolution>().get_primitive()->split() > 1)
1525 lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::splitted_convolution, 1);
1528 //list of layers that do not support yxfb or perform worse than bfyx
1529 if (prim.type() == cldnn::detection_output::type_id() || prim.type() == cldnn::proposal::type_id() ||
1530 prim.type() == cldnn::roi_pooling::type_id() || prim.type() == cldnn::deconvolution::type_id() ||
1531 prim.type() == cldnn::upsampling::type_id() || prim.type() == cldnn::reorg_yolo::type_id())
1532 lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::bfyx_only_layer, 1);
1535 const auto reorder_input = [this, &lo](typed_program_node<convolution>& conv_node)
1537 auto conv_prim = conv_node.get_primitive();
1538 auto& input_node = conv_node.get_dependency(0);
1539 auto&& weights_layout = conv_node.weights(0).get_output_layout();
1540 auto&& input_layout = input_node.get_output_layout();
1542 std::shared_ptr<reorder> new_input = nullptr;
1544 if (input_node.type() == reorder::type_id()) //convolution's input is a reorder
1546 auto reorder_prim = input_node.as<reorder>().typed_desc();
1547 auto& reorder_input = input_node.get_dependency(0);
1548 auto reorder_layout = input_node.get_output_layout();
1549 reorder_layout.data_type = reorder_prim->output_data_type;
1550 new_input = lo.get_reorder(
1553 layout_optimizer::data_type::input,
1555 weights_layout).first;
1557 auto reorder_removed = false;
1558 if (new_input && new_input->output_format != format::winograd_2x3_s1_data && new_input->output_format != format::bf8_xy16 && new_input->output_format != format::byxf) //output format is not optimal
1560 auto reorder_input_layout = reorder_input.get_output_layout();
1562 auto opt_layout = layout(new_input->output_data_type, new_input->output_format, reorder_input_layout.size);
1563 if (reorder_input_layout == opt_layout) //reorder 'breaks' optimal format
1565 if (reorder_prim->subtract_per_feature.empty() &&
1566 reorder_prim->mean.empty() &&
1567 !reorder_prim->output_padding) //just plain reorder
1569 conv_node.replace_dependency(0, reorder_input);
1570 if (input_node.get_users().size() == 0 && !input_node.is_output())
1572 reorder_removed = extract_and_remove(input_node);
1574 new_input = nullptr;
1576 else //change reorder's output layout
1578 reorder_prim->output_format = opt_layout.format;
1579 reorder_prim->output_data_type = opt_layout.data_type;
1580 new_input = nullptr;
1583 else //current reorder gives bad output, simply change it
1585 reorder_prim->output_format = opt_layout.format;
1586 reorder_prim->output_data_type = opt_layout.data_type;
1587 new_input = nullptr;
1591 if (!reorder_removed)
1592 input_node.recalc_output_layout();
1594 conv_node.recalc_output_layout();
1598 new_input = lo.get_reorder(
1599 input_node.get_output_layout(),
1601 layout_optimizer::data_type::input,
1603 weights_layout).first;
1606 if (new_input && new_input->output_format == format::winograd_2x3_s1_data)
1608 auto lower_size = (conv_prim->input_offset.negate() + input_layout.size);
1610 tensor upper_input_padding = tensor{ 0 };
1611 upper_input_padding.spatial[0] = (2 - (lower_size.spatial[0] % 2)) % 2; //winograd conv requires input's x to be in form 4 + 2n, with restriction that x >= 3, we can shortage it to x % 2 == 0
1612 upper_input_padding.spatial[1] = (8 - ((lower_size.spatial[1] - 2) % 8)) % 8; //for y, y - 2 % 8 == 0 must hold
1614 apply_needed_padding(conv_node, input_node, padding{ conv_prim->input_offset.negate().sizes(), upper_input_padding.sizes() });
1616 auto winograd_output = std::make_shared<reorder>("_winograd_" + conv_node.id(), conv_node.id(), input_layout.format,
1617 input_layout.data_type, std::vector<float>{}, cldnn_reorder_mean_mode::mean_subtract, conv_node.output_layout.data_padding);
1618 conv_node.output_layout.data_padding = padding{};
1619 auto& back_node = get_or_create(winograd_output);
1620 back_node.processing_itr = processing_order.insert(std::next(conv_node.processing_itr), &back_node);
1622 auto bias_term = conv_node.bias_term();
1623 //create additional eltwise node after reorder to compute bias
1626 auto& bias_node = conv_node.get_dependency(2);
1627 std::vector<primitive_id> inputs = { back_node.id(), bias_node.id() };
1628 auto winograd_output_biases = std::make_shared<eltwise>(back_node.id() + "_bias", inputs,
1629 cldnn::eltwise_mode::sum, conv_prim->with_activation, conv_prim->activation_negative_slope,
1630 back_node.output_layout.data_padding);
1631 back_node.output_layout.data_padding = padding{};
1632 auto& back_bias_node = get_or_create(winograd_output_biases);
1633 back_bias_node.processing_itr = processing_order.insert(std::next(back_node.processing_itr), &back_bias_node);
1634 replace_all_usages(back_node, back_bias_node);
1635 add_connection(back_node, back_bias_node);
1636 add_connection(bias_node, back_bias_node);
1637 conv_node.invalidate_users();
1638 replace_all_usages(conv_node, back_bias_node);
1641 if (conv_prim->with_activation)
1643 conv_node.typed_desc()->with_activation = false;
1645 back_node.set_fused_activation(activation_relu_negative_slope, cldnn_activation_additional_params_t{ conv_prim->activation_negative_slope, 0.0f });
1650 conv_node.invalidate_users();
1651 replace_all_usages(conv_node, back_node);
1653 add_connection(conv_node, back_node);
1655 auto& r_node = get_or_create(new_input);
1656 r_node.as<reorder>().set_input_offset(conv_prim->input_offset);
1660 swap_names(conv_node, back_node);
1661 if (conv_node.is_output())
1663 conv_node.set_output(false);
1664 back_node.set_output(true);
1665 for (auto& output : outputs)
1667 if (output == &conv_node)
1669 output = &back_node;
1677 conv_node.remove_dependency(2);
1678 auto& back_bias_node = *nodes_map.find(back_node.id() + "_bias")->second;
1679 swap_names(conv_node, back_bias_node);
1680 if (conv_node.is_output())
1682 conv_node.set_output(false);
1683 back_bias_node.set_output(true);
1684 for (auto& output : outputs)
1686 if (output == &conv_node)
1688 output = &back_bias_node;
1696 if (new_input && (new_input->output_format == format::bf8_xy16 || new_input->output_format == format::byxf))
1698 auto conv1x1_output = std::make_shared<reorder>("_conv1x1_reorder_back_" + conv_node.id(), conv_node.id(), input_layout.format, input_layout.data_type);
1699 auto& back_node = get_or_create(conv1x1_output);
1700 back_node.processing_itr = processing_order.insert(std::next(conv_node.processing_itr), &back_node);
1702 conv_node.invalidate_users();
1703 replace_all_usages(conv_node, back_node);
1704 add_connection(conv_node, back_node);
1709 auto& r_node = get_or_create(new_input);
1710 add_intermediate(r_node, conv_node, 0, r_node.dependencies.empty());
1711 conv_node.recalc_output_layout();
1715 const auto reorder_input_detection_output = [this, &lo](typed_program_node<detection_output>& detection_output_node)
1717 auto detection_output_prim = detection_output_node.get_primitive();
1719 for (size_t i = 0; i < detection_output_node.get_dependencies().size(); i++)
1721 auto& input = detection_output_node.get_dependency(i);
1722 std::shared_ptr<reorder> new_input = lo.get_reorder(
1723 input.get_output_layout(),
1725 layout_optimizer::data_type::input,
1726 detection_output_node,
1727 layout{ data_types::f32, format::bfyx, tensor{} }).first;
1731 add_intermediate(new_input, detection_output_node, i);
1736 for (auto& prim : processing_order)
1738 //there's an assumption that only convolution will take data/input_layout as input
1739 //exception to that rule would be a convolution which takes a reorder as input - see reoder_input above
1740 do_for_types<convolution, detection_output>(*prim,
1741 reorder_input, //case for convolution
1742 reorder_input_detection_output //case for detection-output
1747 //function which prepares given primitive for weights optimization
1748 template <typename T>
1749 void program_impl::optimize_bias(T& node, layout_optimizer& lo)
1751 layout output_layout = node.get_output_layout();
1753 size_t weights_offset = node.get_primitive()->input.size();
1754 size_t bias_offset = weights_offset + wrap_if_single(node.get_primitive()->weights).size();
1755 for (size_t i = bias_offset; i < node.get_dependencies().size(); ++i)
1757 //find weights primitive with given pimitive_id and add it to weights_optimizer
1758 const program_node& bias = node.get_dependency(i);
1759 const auto bias_type = layout_optimizer::data_type::bias;
1760 auto reorder = lo.get_reorder(
1761 bias.get_output_layout(),
1768 this->add_intermediate(reorder.first, node, i, !reorder.second);
1771 template void program_impl::optimize_bias<convolution_node>(convolution_node& node, layout_optimizer& lo);
1772 template void program_impl::optimize_bias<deconvolution_node>(deconvolution_node& node, layout_optimizer& lo);
1773 template void program_impl::optimize_bias<fully_connected_node>(fully_connected_node& node, layout_optimizer& lo);
1774 template void program_impl::optimize_bias<embed_node>(embed_node& node, layout_optimizer& lo);
1776 void program_impl::pre_optimize_bias(layout_optimizer& lo)
1778 for (auto& p : nodes_map)
1780 auto& prim = *p.second;
1781 if (prim.type() == convolution::type_id())
1783 if (!prim.as<convolution>().weights_quantization_term())
1784 optimize_bias(prim.as<convolution>(), lo);
1786 else if (prim.type() == deconvolution::type_id())
1788 optimize_bias(prim.as<deconvolution>(), lo);
1790 else if (prim.type() == fully_connected::type_id())
1792 if (!prim.as<fully_connected>().weights_quantization_term())
1793 optimize_bias(prim.as<fully_connected>(), lo);
1795 else if (prim.type() == embed::type_id())
1797 optimize_bias(prim.as<embed>(), lo);
1802 template <typename T>
1803 void program_impl::optimize_depthwise_sep_pre(T& node)
1805 //enable optimization only when IFM / split <= 8 (otherwise scheduling multiple opt kernels is better) and split >= 16
1806 if (!(node.get_dependency(0).get_output_layout().size.feature[0] / node.get_primitive()->split() <= 8) ||
1807 !(node.get_primitive()->split() >= 16))
1810 //make sure the weights and biases are data type and
1811 //are not reused in other primitives as they will be overriden with concatenated ones
1812 for (size_t i = 1; i < node.get_dependencies().size(); i++)
1814 auto& weights_or_biases = node.get_dependency(i);
1815 if (weights_or_biases.get_users().size() > 1 || weights_or_biases.type() != data::type_id())
1819 node.set_depthwise_sep_opt(true);
1821 template void program_impl::optimize_depthwise_sep_pre<convolution_node>(convolution_node& node);
1822 template void program_impl::optimize_depthwise_sep_pre<deconvolution_node>(deconvolution_node& node);
1824 void program_impl::prepare_depthwise_sep_opt()
1826 //depthiwise separated convolution/deconvolution optimization
1827 for (auto& p : nodes_map)
1829 auto& prim = *p.second;
1830 if (prim.type() == convolution::type_id())
1832 optimize_depthwise_sep_pre(prim.as<convolution>());
1834 else if (prim.type() == deconvolution::type_id())
1836 optimize_depthwise_sep_pre(prim.as<deconvolution>());
1841 void program_impl::handle_reshape()
1843 //reshape primitive by definition does not change underlying data, only shape description
1844 //however during graph initialization and data optimization the layouts can be changed without user's knowledge,
1845 //when reshape is followed by reorder, it is likely that reorder's output will not be as expected (for example reshape with flattened shape)
1846 //this pass resolved the issue by changing graph in the following way
1847 //- in case reshape has multiple users with reshape->reorder sequence, it will be splitted to multiple reshape primitives with single user
1848 //- in case of reshape->reorder sequence, the additional reorder before reshape will be added,
1849 // if last reorder does not contain padding or mean subtract, it will be removed later in the graph
1851 for (const auto& node : processing_order)
1853 if (node->is_type<reshape>())
1855 auto& input_node = node->get_dependency(0);
1857 if (input_node.is_type<reorder>())
1860 //vector for storing nodes that are reorder type, for which splitted primitives are needed (except for the first one where orginal reshape will be used)
1861 std::vector<program_node*> reorder_node_to_split;
1863 //find the users of reshape that are reorder type, if none present then skip the current node
1864 for (const auto& user : node->get_users())
1866 if (user->is_type<reorder>())
1867 reorder_node_to_split.push_back(user);
1870 if (!reorder_node_to_split.empty())
1872 auto& prim_node = node->as<reshape>();
1873 const auto& prim = prim_node.get_primitive();
1874 auto output_shape = prim->output_shape;
1876 //vector for storing reshape nodes to connect to new reorder nodes (if needed)
1877 std::vector<program_node*> reorder_reshape_nodes;
1879 bool skip_first_user = false;
1880 auto reshape_users = node->get_users();
1881 for (const auto& user : reshape_users)
1883 //reshape node for first user will be the orginal reshape from the graph
1884 if (!skip_first_user)
1886 if (std::find(reorder_node_to_split.begin(), reorder_node_to_split.end(), user) != reorder_node_to_split.end())
1887 reorder_reshape_nodes.push_back(node);
1888 skip_first_user = true;
1892 //other reshapes will be clones of the orginal one connected to reshape->reorder sequences
1893 if (std::find(reorder_node_to_split.begin(), reorder_node_to_split.end(), user) != reorder_node_to_split.end())
1895 auto new_reshape = std::make_shared<reshape>("_reshape_split_" + user->id() + "_" + node->id(), input_node.id(), output_shape);
1896 auto& new_reshape_node = get_or_create(new_reshape);
1897 add_connection(input_node, new_reshape_node);
1898 user->replace_dependency(0, new_reshape_node);
1899 new_reshape_node.processing_itr = processing_order.insert(std::next(input_node.processing_itr), &new_reshape_node);
1900 reorder_reshape_nodes.push_back(&new_reshape_node);
1904 //add new reorder nodes to proper reshape node
1905 auto reshape_reorder_id = 0;
1906 for (const auto& reorder_node : reorder_node_to_split)
1908 auto& reorder_reshape_node = reorder_reshape_nodes[reshape_reorder_id];
1909 auto reshape_in_layout = reorder_node->get_output_layout();
1910 auto reshape_input = std::make_shared<reorder>("_reshape_input_" + reorder_node->id() + "_" + reorder_reshape_node->id(), input_node.id(), reshape_in_layout.format, reshape_in_layout.data_type);
1911 auto& reshape_input_node = get_or_create(reshape_input);
1912 add_intermediate(reshape_input_node, *reorder_reshape_node, 0, reshape_input_node.dependencies.empty());
1913 reshape_reorder_id++;
1917 auto reshape_layout = node->get_output_layout();
1918 if (!(node->is_output()) && (reshape_layout.format != cldnn::format::bfyx))
1920 auto bfyx_layout = layout({ reshape_layout.data_type, cldnn::format::bfyx, reshape_layout.size });
1921 //when some primitive does an implicit reorder to some other format then we lose the info about pitches in reshape stage
1922 //we assume user provides the input vector in bfyx
1923 if (!are_layouts_identical(reshape_layout, bfyx_layout).second)
1925 auto reshape_input = std::make_shared<reorder>("_reshape_input_" + node->id(), input_node.id(), cldnn::format::bfyx, reshape_layout.data_type);
1926 auto& reshape_input_node = get_or_create(reshape_input);
1927 add_intermediate(reshape_input_node, *node, 0, reshape_input_node.dependencies.empty());
1929 auto reshape_users = node->get_users();
1930 for (const auto& user : reshape_users)
1933 for (size_t i = 0; i < user->get_dependencies().size(); i++)
1935 auto& input = user->get_dependency(i);
1936 if (input.id() == node->id()) {
1941 auto reshape_output = std::make_shared<reorder>("_reshape_output_" + node->id(), user->id(), reshape_layout.format, reshape_layout.data_type);
1942 auto& reshape_output_node = get_or_create(reshape_output);
1943 add_intermediate(reshape_output_node, *user, idx, reshape_output_node.dependencies.empty());
1951 //function which prepares given primitive for weights optimization
1952 template <typename T>
1953 void program_impl::optimize_weights(T& node, layout_optimizer& lo)
1955 auto weights_offset = node.get_primitive()->input.size();
1956 auto bias_offset = weights_offset + wrap_if_single(node.get_primitive()->weights).size();
1957 for (auto i = weights_offset; i < bias_offset; i++)
1959 auto& weights = node.get_dependency(i);
1960 auto* impl = node.get_selected_impl().get();
1961 auto output_layout = node.get_output_layout();
1962 auto& weights_node = node.get_dependency(1);
1963 auto weights_layout = weights_node.get_output_layout();
1964 const auto weights_type = layout_optimizer::data_type::weights;
1966 auto reorders = lo.get_generic_layer(
1967 impl->_weights_reorder_params,
1972 for (auto& reorder : reorders)
1974 //insert new generic_layer node to topology
1975 this->add_intermediate(reorder.first, node, i, !reorder.second);
1976 //set generic_layer's node output layout and implementation
1977 auto& g_node = node.get_dependency(i);
1978 g_node.get_output_layout(false);
1979 g_node.selected_impl = g_node.type()->choose_impl(*engine, g_node);
1981 //set the old output layout and do not invalidate users as change of weights will not affect output layout
1982 node.set_output_layout(output_layout, false);
1985 template void program_impl::optimize_weights<convolution_node>(convolution_node& node, layout_optimizer& lo);
1986 template void program_impl::optimize_weights<deconvolution_node>(deconvolution_node& node, layout_optimizer& lo);
1987 template void program_impl::optimize_weights<fully_connected_node>(fully_connected_node& node, layout_optimizer& lo);
1989 void program_impl::post_optimize_weights(layout_optimizer& lo)
1991 for (auto& p : nodes_map)
1993 auto& prim = *p.second;
1994 if (prim.type() == convolution::type_id())
1996 optimize_weights(prim.as<convolution>(), lo);
1998 else if (prim.type() == deconvolution::type_id())
2000 optimize_weights(prim.as<deconvolution>(), lo);
2002 else if (prim.type() == fully_connected::type_id())
2004 optimize_weights(prim.as<fully_connected>(), lo);
2006 //else if (prim.type() == lstm_gemm::type_id())//TODO: Enable postoptimize weights for lstm
2008 // prep_opt(prim.as<lstm_gemm>()); //we should take care of weights and reccurent
2013 template <typename T>
2014 void program_impl::optimize_depthwise_sep_post(T& node)
2016 if (!node.get_depthwise_sep_opt())
2019 const auto& split = node.get_primitive()->split();
2021 auto dependency_offset = node.get_primitive()->input.size();
2022 //concatenate weights
2024 //if weights were optimized it is needed to use the sizes after optimization
2025 auto target_layout = get_weights_layout(node.get_dependency(dependency_offset), split);
2026 merge_buffers(engine, node, target_layout, dependency_offset, dependency_offset + split);
2027 dependency_offset++;
2030 //concatenate biases
2031 if (node.get_primitive()->bias.size() != 0)
2033 const auto& bias_layout = node.get_dependency(dependency_offset).get_output_layout();
2034 auto target_layout = layout(bias_layout.data_type, cldnn::format::bfyx, { 1, 1, bias_layout.size.spatial[0] * split, 1 });
2035 merge_buffers(engine, node, target_layout, dependency_offset, dependency_offset + split);
2036 dependency_offset++;
2039 if (node.template is_type<convolution>())
2041 auto& prim_node = node.template as<convolution>();
2042 const auto& prim = prim_node.get_primitive();
2044 // concatenate weights quantization factors
2045 if (prim->weights_quantization_factors.size() != 0)
2047 const auto& weights_quantization_layout = node.get_dependency(dependency_offset).get_output_layout();
2048 auto target_layout = layout(weights_quantization_layout.data_type, cldnn::format::bfyx, { 1, 1, weights_quantization_layout.size.batch[0] * split, 1 });
2049 merge_buffers(engine, node, target_layout, dependency_offset, dependency_offset + split);
2050 dependency_offset++;
2052 // concatenate output callibration factors
2053 if (prim->output_calibration_factors.size() != 0)
2055 const auto& output_callibration_layout = node.get_dependency(dependency_offset).get_output_layout();
2056 auto target_layout = layout(output_callibration_layout.data_type, cldnn::format::bfyx, { 1, 1, output_callibration_layout.size.batch[0] * split, 1 });
2057 merge_buffers(engine, node, target_layout, dependency_offset, dependency_offset + split);
2058 dependency_offset++;
2062 if (node.get_primitive())
2063 //override node split, as only one kernel will be executed
2066 template void program_impl::optimize_depthwise_sep_post<convolution_node>(convolution_node& node);
2067 template void program_impl::optimize_depthwise_sep_post<deconvolution_node>(deconvolution_node& node);
2069 void program_impl::prep_opt_depthwise_sep_post()
2071 //depthiwise separated convolution/deconvolution optimization
2072 for (auto& p : nodes_map)
2074 auto& prim = *p.second;
2075 if (prim.type() == convolution::type_id())
2077 optimize_depthwise_sep_post(prim.as<convolution>());
2079 else if (prim.type() == deconvolution::type_id())
2081 optimize_depthwise_sep_post(prim.as<deconvolution>());
2086 void program_impl::apply_needed_padding(program_node& node, program_node& prev_node,
2087 const padding& needed_padding)
2089 auto target_layout = prev_node.get_output_layout();
2091 // Short circuit if padding did not change.
2092 if (target_layout.data_padding == needed_padding)
2095 // Special handling for input nodes.
2096 if (prev_node.is_type<input_layout>() || prev_node.is_type<mutable_data>())
2098 target_layout.data_padding = needed_padding;
2100 auto r_prim = std::make_shared<reorder>("reorder_input_" + node.id(), prev_node.id(), target_layout);
2101 add_intermediate(r_prim, node, 0);
2105 prev_node.merge_output_padding(needed_padding);
2108 void program_impl::prepare_padding()
2110 if (output_size_handling_enabled)
2112 // Prepare upper padding for primitives that support output_size parameter.
2113 for (const auto& node : processing_order)
2115 if (node->is_type<convolution>())
2117 auto& prim_node = node->as<convolution>();
2118 const auto& prim = prim_node.get_primitive();
2120 if (!prim->with_output_size)
2123 auto filter_size = prim_node.weights(0).get_output_layout().size;
2125 auto needed_padding = calc_sliding_window_needed_input_padding(
2126 prim_node.input().get_output_layout(),
2127 prim->output_size, filter_size, prim->input_offset, prim->stride, prim->dilation, false, 1);
2128 apply_needed_padding(prim_node, prim_node.input(), needed_padding);
2130 else if (node->is_type<deconvolution>())
2132 auto& prim_node = node->as<deconvolution>();
2133 const auto& prim = prim_node.get_primitive();
2135 if (!prim->with_output_size)
2138 auto filter_size = prim_node.weights(0).get_output_layout().size;
2140 auto needed_padding = calc_sliding_window_needed_input_padding(
2141 prim_node.input().get_output_layout(),
2142 prim->output_size, filter_size, prim->input_offset, prim->stride, { 1, 1, 1, 1 }, true, 1);
2144 apply_needed_padding(prim_node, prim_node.input(), needed_padding);
2146 else if (node->is_type<pooling>())
2148 auto& prim_node = node->as<pooling>();
2149 const auto& prim = prim_node.get_primitive();
2151 if (!prim->with_output_size)
2154 // NOTE: Currently there is no pooling implementation/pooling mode which does not check input data range.
2155 // There is no need to add padding requirements on pooling inputs.
2156 //auto needed_padding = calc_sliding_window_needed_input_padding(
2157 // prim_node.input().get_output_layout(),
2158 // prim->output_size, prim->size, prim->input_offset, prim->stride, {1, 1, 1, 1}, false, 1);
2159 auto needed_padding = prim_node.input().get_output_layout().data_padding;
2161 apply_needed_padding(prim_node, prim_node.input(), needed_padding);
2166 // Prepare optimized padding for bfyx convolution.
2167 for (auto& pair : nodes_map)
2169 if (pair.second->type() != convolution::type_id())
2172 auto& node = pair.second->as<convolution>();
2173 if (node.get_dependencies().empty())
2176 auto conv = node.get_primitive();
2177 auto& conv_input_node = node.get_dependency(0);
2178 auto conv_layout = node.get_output_layout();
2180 // right now output padding optimization is only available for bfyx format and data type = float32
2181 if (conv_layout.format != cldnn::format::bfyx
2182 && conv_layout.format != cldnn::format::bf8_xy16
2183 && conv_layout.format != cldnn::format::byxf_af32
2184 && conv_layout.format != cldnn::format::fs_bs_yx_bsv4_fsv32)
2189 // Calculating input padding needed for convolution
2190 auto& filter_node = node.as<convolution>().weights(0);
2191 auto filter_prim = filter_node.get_primitive();
2193 layout filter_layout = filter_node.get_output_layout();
2195 // convolution have only one input primitive
2196 auto prev_prim_output_layout = conv_input_node.get_output_layout();
2198 // Compute initial required paddings for primitive used as input for convolution.
2199 auto input_offset = conv->input_offset;
2200 auto stride = conv->stride;
2201 auto dilation = conv->dilation;
2203 auto input_limit_x = input_offset.spatial[0] + (conv_layout.size.spatial[0] - 1) * stride.spatial[0] + (filter_layout.size.spatial[0] - 1) * dilation.spatial[0] + 1;
2204 auto input_limit_y = input_offset.spatial[1] + (conv_layout.size.spatial[1] - 1) * stride.spatial[1] + (filter_layout.size.spatial[1] - 1) * dilation.spatial[1] + 1;
2206 auto left_padding = std::max(-input_offset.spatial[0], 0);
2207 auto top_padding = std::max(-input_offset.spatial[1], 0);
2208 auto right_padding = std::max(input_limit_x - prev_prim_output_layout.size.spatial[0], 0);
2209 auto bottom_padding = std::max(input_limit_y - prev_prim_output_layout.size.spatial[1], 0);
2211 // Adjust right padding, so entire buffer size in X dimension is properly aligned.
2212 // TODO: NOTE: Will be reenabled with next check-in once heuristic for line-aligned algorithm will be added.
2213 //auto needed_buffer_size_x = static_cast<cldnn::tensor::value_type>(
2214 // round_up_to(left_padding + prev_prim_output_layout.size.spatial[0] + right_padding, 16));
2215 //right_padding = needed_buffer_size_x - left_padding - prev_prim_output_layout.size.spatial[0];
2217 cldnn::padding needed_padding({ 0, 0, left_padding, top_padding }, { 0, 0, right_padding, bottom_padding }, 0);
2218 needed_padding = padding::max(prev_prim_output_layout.data_padding, needed_padding);
2220 apply_needed_padding(node, conv_input_node, needed_padding);
2224 void program_impl::propagate_constants()
2226 constants_propagator prop(this);
2228 for (auto& node : processing_order)
2229 prop.visit_node(*node);
2231 auto&& to_replace = prop.calculate();
2233 //remove all nodes which are no longer relevant, i.e. nodes which:
2234 // 1. are constants, and
2235 // 2. do not have non-const user (so their data are not used during inference), and
2236 // 3. are not marked as outputs.
2237 // in case if node has either non-const user or is marked as output, it should be replace with cldnn::data rather than removed (see next loop)
2238 auto proc_itr = processing_order.begin();
2239 while (proc_itr != processing_order.end())
2241 auto& node = (*proc_itr++);
2242 if (!node->is_constant())
2244 if (node->has_non_const_user() || (node->is_output() && !node->is_type<data>()))
2247 auto& users = node->users;
2248 auto& deps = node->dependencies;
2250 for (size_t idx = 0; idx < deps.size(); idx++)
2252 deps.at(idx)->users.remove(node);
2256 for (auto& usr : users) {
2257 auto& usr_deps = usr->dependencies;
2258 usr_deps.erase(std::remove(usr_deps.begin(), usr_deps.end(), node), usr_deps.end());
2262 if (!node->is_output())
2264 auto rem = remove_if_dangling(*node);
2265 assert(rem && "Non-output constant node which has only constant users should have been removed during constants propagation pass");
2270 //replace all constant nodes which are relevant for inference (either used by non-const user or marked as output) with recomputed cldnn::data
2271 for (auto& cout : to_replace)
2273 auto& id_to_replace = cout.first;
2275 //TODO: do not use API primitives internally and get rid of this last 'cldnn::memory' internal usage
2276 memory api_memory = details::memory_c_to_cpp_converter::convert(api_cast(cout.second.get()));
2277 //c-cpp converter does not retain since normally it is done inside API-impl layer (cldnn.cpp) so we need to do it manually
2278 cout.second->add_ref();
2280 auto const_data = std::make_shared<data>("_cldnn_const_prop_" + id_to_replace, api_memory /* <<< REMOVE ME WHEN POSSIBLE */);
2281 auto& new_node = get_or_create(const_data);
2282 auto& curr_node = *nodes_map.at(id_to_replace);
2284 if (!curr_node.is_type<generic_layer>())
2286 auto curr_node_deps = curr_node.get_dependencies();
2287 for (auto& dep : curr_node_deps)
2289 auto dep_users = dep->get_users();
2290 for (auto& dep_user : dep_users)
2292 if (dep_user == &curr_node)
2293 remove_connection(*dep, curr_node);
2298 curr_node.dependencies.clear();
2299 //remove all constant users (as they will be either removed or replaced by cldnn::data which does not have any dependencies)
2300 curr_node.users.erase(
2301 std::remove_if(curr_node.users.begin(), curr_node.users.end(), [](program_node* node) { return node->is_constant(); }),
2302 curr_node.users.end()
2304 replace(curr_node, new_node, false, false);
2308 void program_impl::prepare_buffer_fusing()
2310 bool is_debug = options.get<build_option_type::debug>()->enabled();
2311 auto itr = processing_order.begin();
2312 while (itr != processing_order.end())
2314 auto& node = (*itr++);
2316 // TODO: Move fused activation to previous layer when possible
2317 if (node->fused_activation.activation_func != cldnn_activation_func_t::activation_none)
2320 do_for_types<concatenation>(*node, [this, is_debug](concatenation_node& node)
2322 // buffer fusing should not be performed if one of inputs produces padded output since
2323 // it could break desired memory alignment. On the other hand, if this node uses all inputs
2324 // exclusively (see check above) they should not have output padding set since concatenation
2325 // does not ask for any.
2326 if (node.has_padded_dependency())
2329 auto concat_axis = node.get_primitive()->axis;
2330 auto padd = node.get_output_layout().data_padding;
2332 tensor lower_padd = padd.lower_size();
2333 tensor upper_padd = padd.upper_size();
2335 auto upper_padd_val = node.get_output_layout().get_buffer_size().raw[concat_axis] - lower_padd.raw[concat_axis];
2336 tensor lower_padd_offset = lower_padd;
2338 std::list<std::pair<const std::vector<program_node*>, tensor>> stack = { std::make_pair(node.get_dependencies(), tensor{ 0, 0, 0, 0 }) };
2339 while (!stack.empty())
2341 auto nodes_list = stack.front();
2344 auto cascade_adjustment = nodes_list.second;
2345 upper_padd.raw[concat_axis] = upper_padd_val;
2346 lower_padd = lower_padd_offset;
2348 //check if concatenation in place can be applied for inputs set
2349 for (auto input : nodes_list.first)
2351 //if any of this node's inputs is used by more than one primitive and is not optimized concatenation then do not fuse buffers,
2352 //also, if an input is marked as network output, prevent optimizations which would affect a form of its output (unless debug flag is set)
2353 // todo: in future, if this case is problem, it can be optimized further to enable buffer fusing
2354 // per single input rather than all/none
2355 // + restrict input types to pooling, convolution and activation only due to problems with output padding on b and f
2356 if ((!input->is_type<pooling>() && !input->is_type<convolution>() && !input->is_type<activation>() && !input->is_type<concatenation>() && !input->is_type<crop>() && !input->is_type<scale>()) ||
2357 (input->is_output() && !is_debug) ||
2358 input->get_users().size() > 2)
2361 if (input->get_users().size() > 1)
2363 auto user_count = input->get_users().size();
2364 for (auto& user : input->get_users())
2365 if (user->is_type<concatenation>())
2371 //check only for spatial paddings. Accept feature and batch
2372 if (input->get_output_layout().data_padding.lower_size().spatial[0] != 0 ||
2373 input->get_output_layout().data_padding.upper_size().spatial[0] != 0 ||
2374 input->get_output_layout().data_padding.lower_size().spatial[1] != 0 ||
2375 input->get_output_layout().data_padding.upper_size().spatial[1] != 0)
2379 //apply concatenation in place optimization
2380 for (auto input : nodes_list.first)
2382 auto input_lenght = input->get_output_layout().size.raw[concat_axis];
2384 // shrink upper pad so it points at the end of the input's buffer
2386 // |--- lower padd ---| |---------- upper padd -----------|
2387 // |-- output padd ---| ----- input1 ------|----- input2 -----|-- out padd --|
2388 upper_padd.raw[concat_axis] -= input_lenght;
2390 //adjust padding sizes for cascade concatenations
2391 auto lower_padd_tmp = lower_padd;
2392 lower_padd_tmp.raw[concat_axis] += cascade_adjustment.raw[concat_axis];
2393 auto upper_padd_tmp = upper_padd;
2394 upper_padd_tmp.raw[concat_axis] -= cascade_adjustment.raw[concat_axis];
2396 // set new padding for input
2397 input->set_output_padding(padding(lower_padd_tmp.sizes(), upper_padd_tmp.sizes()));
2399 // move lower padd further
2401 // |-------------- lower padd -------------|---------- upper padd -----------|
2402 // |-- output padd ---| ----- input1 ------|----- input2 -----|-- out padd --|
2404 lower_padd.raw[concat_axis] += input_lenght;
2406 if (input->type() == concatenation::type_id() && input->can_be_optimized())
2408 if (input->as<concatenation>().get_primitive()->axis != node.get_primitive()->axis)
2411 if (!input->get_dependencies().empty())
2412 stack.push_back(std::make_pair(input->get_dependencies(), input->get_output_layout().data_padding.lower_size()));
2417 node.can_be_optimized(true);
2421 do_for_types<crop>(*node, [this, is_debug](crop_node& node)
2423 //if the node is marked as network output, prevent optimizations which would affect a form of its output, unless debug flag is set
2424 if (node.is_output() && !is_debug)
2427 //do not optimize when next node is concatenation which is not output
2428 if (node.get_users().size() == 1 && node.get_users().front()->is_type<concatenation>() && !node.get_users().front()->is_output())
2431 if (node.get_dependencies().size() == 1 &&
2432 node.get_users().size() > 0)
2434 // optimization is avaiable for croping across depth(features) only
2435 // if output padding has defined padding accross featuers already it wouldn't
2436 // work because it expect to have zeros in the padded area.
2437 auto format = node.get_output_layout().format;
2438 auto crop_prim = node.get_primitive();
2439 auto input_layout = node.get_dependency(0).get_output_layout();
2440 auto out_padd = node.get_output_layout().data_padding;
2441 if (format == format::bfyx &&
2442 crop_prim->reference_input.batch[0] == input_layout.size.batch[0] &&
2443 crop_prim->reference_input.spatial[0] == input_layout.size.spatial[0] &&
2444 crop_prim->reference_input.spatial[1] == input_layout.size.spatial[1] &&
2445 out_padd.lower_size().feature[0] == 0 &&
2446 out_padd.upper_size().feature[0] == 0 &&
2447 out_padd.lower_size().batch[0] == 0 &&
2448 out_padd.upper_size().batch[0] == 0 &&
2449 out_padd.lower_size().spatial[0] == 0 &&
2450 out_padd.lower_size().spatial[1] == 0 &&
2451 out_padd.upper_size().spatial[0] == 0 &&
2452 out_padd.upper_size().spatial[1] == 0)
2455 // crop input buffer
2456 // |___________data____________|
2458 // crop output buffer
2459 // |-------->| offsets[f] |<--|
2465 // crop output buffer
2466 // |_low_pad_|__data_size__|___|<-upper pad
2468 node.set_output_padding(padding(
2469 { out_padd.lower_size().batch[0], crop_prim->offsets.feature[0], out_padd.lower_size().spatial[0], out_padd.lower_size().spatial[1] },
2470 { out_padd.upper_size().batch[0], input_layout.size.feature[0] - crop_prim->offsets.feature[0] - crop_prim->reference_input.feature[0],
2471 out_padd.upper_size().spatial[0], out_padd.upper_size().spatial[1] }));
2472 node.can_be_optimized(true);
2477 do_for_types<reshape>(*node, [this](reshape_node& node)
2479 node.get_output_layout();
2480 if (node.is_in_place() && node.get_fused_activation_func() == activation_none)
2481 node.can_be_optimized(true);
2483 do_for_types<reorder>(*node, [this](reorder_node& node)
2485 auto& input = node.input();
2486 auto output_layout = node.get_output_layout();
2487 //This is WA for topologies that due to additional reorders added perform worse with conv1x1 optimization
2488 auto remove_bf8_xy_opt = ((input.is_type<pooling>() || input.is_type<concatenation>()) &&
2489 output_layout.format == format::bf8_xy16 && input.get_users().size() == 1);
2490 //Remove reorder from convolution 1x1 to bfyx in some conditions
2491 auto remove_byxf_opt = (input.is_type<convolution>() &&
2492 input.get_users().size() == 1 &&
2493 input.get_output_layout().format == format::byxf);
2494 //check if all inputs user have the same format
2495 auto all_users_same_format = true;
2496 auto input_user_layout_format = input.get_users().front()->get_output_layout().format;
2497 for (auto const& user : input.get_users())
2499 if (user->get_output_layout().format != input_user_layout_format)
2501 all_users_same_format = false;
2505 auto same_data_type = input.get_output_layout().data_type == output_layout.data_type;
2506 //Optimization only available in case of layers that support different input and output formats.
2507 //todo: new api needs to be created to read such caps
2508 if (!(input.is_type<pooling>() && (output_layout.format == format::bfyx || output_layout.format == format::yxfb || output_layout.format == format::byxf) && all_users_same_format && same_data_type) &&
2509 !remove_bf8_xy_opt &&
2510 !(input.is_type<convolution>() && input.get_output_layout().format == format::bf8_xy16) &&
2511 !(input.is_type<eltwise>() && (output_layout.format == format::bfyx || output_layout.format == format::yxfb || output_layout.format == format::byxf) && all_users_same_format && same_data_type) &&
2512 !(remove_byxf_opt && (node.get_users().front()->is_type<eltwise>() || node.get_users().front()->is_type<pooling>())))
2515 if (remove_bf8_xy_opt)
2517 auto users_user_layout = node.get_users().front()->get_users().front()->get_output_layout();
2518 auto input_layout = input.get_output_layout();
2519 auto target_layout = layout(input_layout.data_type, users_user_layout.format, input_layout.size, input_layout.data_padding);
2520 input.set_output_layout(target_layout, false);
2522 else if (remove_byxf_opt)
2524 auto user = node.get_users().front();
2525 auto users_users = node.get_users().front()->get_users();
2527 for (auto const& users_user : users_users)
2529 if (users_user->get_output_layout().format != format::byxf && !users_user->is_type<eltwise>())
2531 remove_byxf_opt = false;
2536 if (remove_byxf_opt)
2538 auto input_layout = input.get_output_layout();
2539 user->set_output_layout(input_layout, false);
2543 input.set_output_layout(output_layout, false);
2545 node.can_be_optimized(true);
2546 extract_and_remove(node); //try to remove redundant reorders
2551 void program_impl::fuse_skip_layers(program_node* node)
2553 do_for_types<eltwise>(*node, [this](eltwise_node& node)
2555 bool skippable = false;
2557 if (node.get_primitive()->mode != eltwise_mode::sum || node.inputs_count() != 2)
2560 if (node.input(0).is_type<deconvolution>())
2564 else if (node.input(1).is_type<deconvolution>())
2573 auto& to_fuse_with = node.input(index).as<deconvolution>();
2574 int to_fuse_index = index == 0 ? 1 : 0;
2576 // check that node doesn't have fused eltwise already
2577 if (to_fuse_with.has_fused_sum())
2580 //remove dependencies and users of elwtise that is going to be extracted
2581 add_connection(node.input(to_fuse_index), to_fuse_with);
2582 remove_connection(node.input(to_fuse_index), node);
2584 //replace processing_num of the node where fusing take place and eltwise
2585 auto new_processing_num = node.processing_num;
2586 processing_order.erase(to_fuse_with.processing_itr);
2587 to_fuse_with.processing_itr = processing_order.insert(node.processing_itr, &to_fuse_with);
2588 to_fuse_with.processing_num = new_processing_num;
2590 //make sure that new fused node's users have higher processing_num than fused node
2591 for (auto user : to_fuse_with.get_users())
2593 if (user->processing_num < new_processing_num)
2595 processing_order.erase(user->processing_itr);
2596 user->processing_itr = processing_order.insert(std::next(to_fuse_with.processing_itr), user);
2597 user->processing_num = new_processing_num + 1;
2601 if (node.get_fused_activation_func() != activation_none)
2602 to_fuse_with.set_fused_activation(node.get_fused_activation_func(), node.get_fused_activation_params());
2603 to_fuse_with.set_output_padding(node.get_output_layout().data_padding);
2605 extract_and_remove(node);
2609 void program_impl::prepare_primitive_fusing()
2611 bool is_debug = options.get<build_option_type::debug>()->enabled();
2613 auto itr = processing_order.begin(); //note we need to use iterators since currently processed element can be removed
2614 while (itr != processing_order.end())
2616 auto node_itr = itr++;
2617 auto& node = (*node_itr);
2619 do_for_types<activation>(*node, [this, is_debug](activation_node& node)
2622 auto& input = node.input();
2625 // - inputs cannot be padded
2626 // - primitives input cannot be output
2627 // - no activation additional input
2628 // - input was optimized
2629 if (node.has_padded_dependency() || (input.is_output() && !is_debug) || node.is_output() ||
2630 node.get_dependencies().size() != 1 || input.can_be_optimized())
2633 // - check if there is no activation fused already
2634 // - limit to primitives which implementations support activation fusing
2635 if (input.get_users().size() != 1 || input.get_fused_activation_func() != activation_none ||
2636 //TODO: new api needs to be created to read such caps
2637 //right now use whitelist so no new primitives will be affected in case of lack of fused activation support
2638 (!input.is_type<batch_norm>() && !input.is_type<concatenation>() && !input.is_type<convolution>() &&
2639 !input.is_type<crop>() && !input.is_type<deconvolution>() && !input.is_type<eltwise>() &&
2640 !input.is_type<fully_connected>() && !input.is_type<lrn>() && !input.is_type<normalize>() &&
2641 !input.is_type<permute>() && !input.is_type<pooling>() && !input.is_type<reorder>() &&
2642 !input.is_type<roi_pooling>() && !input.is_type<scale>() &&
2643 !input.is_type<softmax>() && !input.is_type<upsampling>() && !input.is_type<mvn>()))
2646 input.set_fused_activation(node.get_primitive()->activation_func, node.get_primitive()->additional_params);
2647 input.set_output_padding(node.get_output_layout().data_padding);
2649 extract_and_remove(node);
2653 //Second loop tries fusing several reorders one by one (if present) into one reorder
2654 itr = processing_order.begin();
2655 while (itr != processing_order.end())
2657 auto node_itr = itr++;
2658 auto& node = (*node_itr);
2660 do_for_types<reorder>(*node, [this, is_debug](reorder_node& node)
2662 auto& input = node.input();
2665 // - inputs cannot be padded
2666 // - primitives input cannot be output
2667 // - input was optimized
2668 if (node.has_padded_dependency() || (input.is_output() && !is_debug) || node.get_dependencies().size() != 1 ||
2669 input.can_be_optimized())
2672 // - check if previous node is reorder with 1 user
2673 // - do not fuse if current node has mean subtract
2674 if (input.get_users().size() != 1 || !input.is_type<reorder>() ||
2675 node.has_mean() || !node.get_primitive()->subtract_per_feature.empty())
2678 input.set_output_layout(node.get_output_layout(), false);
2679 extract_and_remove(node);
2682 //Third loop tries fusing eltwise (sum) with deconvolution
2683 itr = processing_order.begin();
2684 while (itr != processing_order.end())
2686 auto node_itr = itr++;
2687 auto& node = (*node_itr);
2689 fuse_skip_layers(node);
2693 program_node& program_impl::get_or_create(std::shared_ptr<primitive> prim)
2695 auto itr = nodes_map.lower_bound(prim->id);
2696 if (itr != nodes_map.end() && itr->first == prim->id)
2697 return *itr->second;
2699 auto new_node = prim->type->create_node(*this, prim);
2700 new_node->set_org_primitive_id(new_node->id());
2701 nodes_map.insert(itr, { prim->id, new_node });
2705 void program_impl::add_intermediate(program_node& node, program_node& next, size_t prev_idx, bool connect_int_node_with_old_dep)
2707 if (connect_int_node_with_old_dep && !node.dependencies.empty())
2708 throw std::invalid_argument("Node which is about to be added inbetween two other nodes should not have any existing dependencies");
2710 auto& prev = next.get_dependency(prev_idx);
2711 //firstly add connection, later replace dependency, so 'prev' won't become dangling and therefore removed
2712 if (connect_int_node_with_old_dep)
2714 add_connection(prev, node);
2715 if (node.processing_itr != processing_order.end())
2716 processing_order.erase(node.processing_itr);
2718 auto itr = prev.processing_itr;
2719 node.processing_itr = processing_order.insert(++itr, &node);
2720 node.processing_num = prev.processing_num;
2723 next.replace_dependency(prev_idx, node);
2724 node.constant = prev.constant;
2725 node.data_flow = prev.data_flow;
2726 if (prev.constant_frontier)
2728 node.constant_frontier = true;
2729 prev.constant_frontier = false;
2733 void program_impl::rename(program_node & node, primitive_id const & new_id)
2735 if (nodes_map.count(new_id))
2736 throw std::runtime_error("Trying to rename program_node but node with id " + new_id + " already exists");
2737 if (node.is_output())
2738 throw std::invalid_argument("Trying to rename an output node. If you intend to do that, please clear 'output' flag manually.");
2740 auto node_ptr = nodes_map.find(node.id())->second;
2741 nodes_map.emplace(new_id, node_ptr);
2742 nodes_map.erase(node.id());
2744 if (!node.is_type<internal_primitive>())
2745 const_cast<primitive_id&>(node.desc->id) = new_id;
2747 reinterpret_cast<details::internal_program_node_base&>(node).internal_id = new_id;
2750 void program_impl::swap_names(program_node& node1, program_node& node2)
2752 const auto _extract_id = [](program_node& node) -> primitive_id&
2754 if (!node.is_type<internal_primitive>())
2755 return const_cast<primitive_id&>(node.desc->id);
2757 return reinterpret_cast<details::internal_program_node_base&>(node).internal_id;
2760 nodes_map.at(node1.id()).swap(nodes_map.at(node2.id()));
2761 std::swap(_extract_id(node1), _extract_id(node2));
2764 void program_impl::replace_all_usages(program_node & old_node, program_node & new_node)
2766 auto itr = old_node.users.begin();
2767 bool end = (itr == old_node.users.end());
2770 auto& usage = (*itr++);
2771 end = (itr == old_node.users.end());
2772 usage->replace_dependency(old_node, new_node);
2776 void program_impl::replace(program_node& old_node, program_node& new_node, bool replace_whole_branch, bool check_output_layouts_integrity)
2778 if ((!new_node.dependencies.empty() && !replace_whole_branch) || !new_node.users.empty())
2779 throw std::invalid_argument("Node which is about to replace other node should be detached");
2781 if (new_node.is_output())
2782 throw std::invalid_argument("Replacement node shouldn't be marked as an output since it's impossible to rename such node.");
2784 auto id = old_node.id();
2785 new_node.output_layout = old_node.get_output_layout();
2786 new_node.valid_output_layout = old_node.valid_output_layout;
2788 if (!replace_whole_branch)
2790 //copy old's dependencies
2791 while (!old_node.dependencies.empty())
2793 auto& dep = old_node.dependencies.back();
2794 add_connection(*dep, new_node);
2795 remove_connection(*dep, old_node);
2800 for (auto& user : old_node.users)
2802 new_node.users.push_back(user);
2803 for (auto& users_dep : user->dependencies)
2805 if (users_dep == &old_node)
2807 users_dep = &new_node;
2813 old_node.users.clear();
2815 if (check_output_layouts_integrity && new_node.valid_output_layout)
2816 new_node.recalc_output_layout();
2818 bool old_was_output = false;
2820 if (old_node.is_output())
2822 old_was_output = true;
2823 old_node.set_output(false);
2824 outputs.erase(std::remove(outputs.begin(), outputs.end(), &old_node), outputs.end());
2826 if (new_node.is_input())
2827 inputs.push_back(&new_node);
2828 if (old_node.is_input())
2829 inputs.remove(&old_node);
2831 new_node.constant = old_node.constant;
2832 new_node.constant_frontier = old_node.constant_frontier;
2833 new_node.user_mark = old_node.user_mark;
2835 auto old_news_pos = new_node.processing_itr;
2836 new_node.processing_itr = processing_order.insert(old_node.processing_itr, &new_node);
2837 new_node.processing_num = old_node.processing_num;
2838 if (old_news_pos != processing_order.end())
2839 processing_order.erase(old_news_pos);
2840 if (old_node.processing_itr != processing_order.end())
2841 processing_order.erase(old_node.processing_itr);
2843 nodes_map.erase(id);
2844 rename(new_node, id);
2846 //mark new node as an output after renaming
2849 new_node.set_output(true);
2850 outputs.push_back(&new_node);
2854 bool program_impl::remove_if_dangling(program_node& node, bool detach_whole_branch)
2856 if (!node.users.empty())
2858 if (!detach_whole_branch && !node.dependencies.empty())
2861 std::list<program_node*> to_remove;
2862 std::list<program_node*> marked;
2863 if (detach_whole_branch)
2866 std::list<program_node*> queue = { &node };
2867 while (!queue.empty())
2869 auto curr = queue.front();
2871 marked.push_back(curr);
2873 //remove only if all users also has been marked
2874 bool rem = !std::any_of(curr->get_users().begin(), curr->get_users().end(), [](program_node* node) { return !node->is_marked(); });
2876 to_remove.push_back(curr);
2878 for (auto dep : curr->get_dependencies())
2880 if (!dep->is_marked())
2883 queue.push_back(dep);
2889 to_remove.push_back(&node);
2891 for (auto n : marked)
2894 for (auto rem : to_remove)
2896 if (!rem->is_output() || is_debug_build())
2898 if (detach_whole_branch)
2900 for (auto& user : rem->get_users())
2901 user->remove_dependency(*rem);
2903 if (rem->is_input())
2906 if (std::find(processing_order.begin(), processing_order.end(), rem) != processing_order.end())
2907 processing_order.erase(rem->processing_itr);
2908 optimized_out.push_back(rem->id());
2909 nodes_map.erase(rem->id());
2916 bool program_impl::extract_and_remove(program_node& node)
2918 if (node.get_dependencies().size() != 1)
2921 if (node.is_output() && node.get_dependency(0).is_output() && !is_debug_build()) //TODO: add a mechanism to support removal of nodes which are marked as outputs
2924 if (node.is_output() && !is_debug_build())
2926 auto& prev = node.get_dependency(0);
2927 auto node_id = node.id();
2929 node.set_output(false);
2930 outputs.erase(std::remove(outputs.begin(), outputs.end(), &node), outputs.end());
2932 rename(node, "_cldnn_tmp_" + node_id);
2933 rename(prev, node_id);
2935 prev.set_output(true);
2936 outputs.push_back(&prev);
2939 auto& input = node.get_dependency(0);
2940 node.dependencies.clear();
2941 input.users.remove(&node);
2943 if (node.constant_frontier)
2945 assert(node.constant && "Constant frontier should also, by definition, be constant");
2946 assert(input.constant && "Input for constant forontier should, by definition, be constant");
2947 input.constant_frontier = true;
2950 if (!node.is_endpoint())
2951 replace_all_usages(node, input);
2953 remove_if_dangling(node);
2958 void program_impl::replace_data_with_optimized(std::map<primitive_id, memory_impl::ptr> const & replace_map)
2960 for (auto& result : replace_map)
2962 auto& node = *nodes_map.at(result.first);
2963 assert(node.is_type<data>() && "Optimized primitive is not a cldnn::data");
2964 assert(result.second != nullptr && "Memory which handles result of optimization should not be nullptr");
2965 node.as<data>().attach_memory(*result.second, false);
2969 void program_impl::dump_memory_pool() const
2971 if (!get_engine().configuration().enable_memory_pool)
2973 auto path = get_dir_path(options);
2979 path += "cldnn_memory_pool.log";
2980 auto dep = get_memory_dependencies_string();
2981 get_engine().dump_memory_pool(*this, path, dep);
2982 dump_program("14_memory_pool", true);
2985 //TODO: break this function into number of smaller ones + add per-primitive fields (possibly use primitive_inst::to_string?)
2986 void program_impl::dump_program(const char* stage, bool with_full_info, std::function<bool(program_node const&)> const& filter) const
2988 auto path = get_dir_path(options);
2994 std::ofstream graph(path + "cldnn_program_" + std::to_string(prog_id) + "_" + stage + ".graph");
2995 dump_graph_init(graph, *this, filter);
2997 if (!with_full_info)
3002 graph.open(path + "cldnn_program_" + std::to_string(prog_id) + "_" + stage + ".info");
3003 dump_graph_info(graph, *this, filter);
3005 graph.open(path + "cldnn_program_" + std::to_string(prog_id) + "_" + stage + ".order");
3006 dump_graph_processing_order(graph, *this);
3008 graph.open(path + "cldnn_program_" + std::to_string(prog_id) + "_" + stage + ".optimized");
3009 dump_graph_optimized(graph, *this);
3012 //Dumps weights and biasses in serialization process, not working yet, in progress.
3013 void program_impl::dump_weights_and_biasses(std::vector<unsigned long long>& offsets, std::vector<std::string>& data_names, std::ofstream& file_stream) const
3015 for (auto const& n : nodes_map)
3017 auto dependency_count = (unsigned int)n.second.get()->get_dependencies().size();
3018 for (unsigned int dp = 0; dp < dependency_count; dp++)
3020 auto& dependency = n.second.get()->get_dependency(dp);
3021 if (dependency.is_type<data>())
3023 offsets.push_back(offsets.empty() ? 0ull : offsets.back());
3024 auto& mem = dependency.as<data>().get_attached_memory();
3025 if (mem.get_layout().data_type == data_types::f32)
3026 dump_data(mem, file_stream, offsets.back(), sizeof(float));
3028 dump_data(mem, file_stream, offsets.back(), sizeof(short));
3029 data_names.push_back(dependency.as<data>().id());
3033 file_stream.close();
3036 //Makes serialization with given name.
3037 //Placeholder, not working yet, in progress.
3038 void program_impl::serialize(std::string network_name, std::function<bool(program_node const&)> const& filter) const
3040 std::vector<unsigned long long> offsets;
3041 std::vector<std::string> data_names;
3043 std::ofstream file_stream(network_name + "_" + "serialization" + ".bin", std::ios::binary);
3044 dump_kernels(engine->get_context().get()->get_kernels_cache().get_context().get_binaries(), offsets, data_names, file_stream);
3045 dump_weights_and_biasses(offsets, data_names, file_stream);
3047 std::ofstream graph(network_name + "_" + "serialization" + ".xml");
3048 dump_to_xml(graph, *this, filter, offsets, data_names);