inference-engine/thirdparty/clDNN/src/program.cpp (platform/upstream/dldt.git, commit 6d10688)
1 /*
2 // Copyright (c) 2016 Intel Corporation
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //      http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 */
16
17 ///////////////////////////////////////////////////////////////////////////////////////////////////
18
19 #include "program_impl.h"
20 #include "primitive_inst.h"
21 #include "layout_optimizer.h"
22 #include "constants_propagator.h"
23
24 #include "primitive_type.h"
25 #include "api/CPP/activation.hpp"
26 #include "api/CPP/eltwise.hpp"
27 #include "api/CPP/input_layout.hpp"
28 #include "api/CPP/pooling.hpp"
29 #include "api/CPP/proposal.hpp"
30 #include "api/CPP/roi_pooling.hpp"
31 #include "api/CPP/reorg_yolo.hpp"
32
33 #include "activation_inst.h"
34 #include "batch_norm_inst.h"
35 #include "internal_primitive.h"
36 #include "internal_primitive_type_base.h"
37 #include "convolution_inst.h"
38 #include "concatenation_inst.h"
39 #include "crop_inst.h"
40 #include "data_inst.h"
41 #include "mutable_data_inst.h"
42 #include "deconvolution_inst.h"
43 #include "detection_output_inst.h"
44 #include "lrn_inst.h"
45 #include "normalize_inst.h"
46 #include "permute_inst.h"
47 #include "prior_box_inst.h"
48 #include "reorder_inst.h"
49 #include "reshape_inst.h"
50 #include "scale_inst.h"
51 #include "embed_inst.h"
52 #include "softmax_inst.h"
53 #include "split_inst.h"
54 #include "program_dump_graph.h"
55 #include "upsampling_inst.h"
56 #include "eltwise_inst.h"
57 #include "fully_connected_inst.h"
58 #include "mvn_inst.h"
59 #include "lstm_inst.h"
60 #include "lstm_gemm_inst.h"
61 #include "lstm_elt_inst.h"
63
64 #include "network_impl.h"
65 #include "kernel_selector_helper.h"
66 #include "sliding_window_utils.h"
67 #include "error_handler.h"
68
69 #include <fstream>
70 #include <algorithm>
71 #include <stdio.h>
72 #include <iostream>
73 #include <sstream>
74 #include <iomanip>
75
76
77 namespace {
78
79     //helper function for selecting a function based on the type of the given primitive
80     //this is the termination case for the parameter pack recursion, see the overload below for the logic
81     template <class... T>
82     void do_for_types(program_node&)
83     {
84         return;
85     }
86
87     //helper function for selecting a function based on the type of the given primitive.
88     //this function should be explicitly given a set of types and implicitly a set of functions.
89     //both sets should have equal size. The first function will be called if the type of the given primitive
90     //matches the first explicitly given type, the second will be called if it matches the second explicitly given
91     //type, etc.
92     //Functions given as arguments should themselves take typed_program_node<T>& as argument,
93     //where T is the type that should match for that function to be called
94     //
95     //example:
96     // do_for_types<
97     //      convolution,
98     //      pooling
99     //  >(primitive,
100     //      [](typed_program_node<convolution>&){ do something if 'primitive' is a convolution },
101     //      [](typed_program_node<pooling>&)    { do something if 'primitive' is a pooling }
102     //  );
103     template <class T, class... RestOfT, class Func, class... RestOfFuncs>
104     decltype(static_cast<void>(std::declval<Func>()(std::declval<typed_program_node<T>&>()))) do_for_types(
105         program_node& node,
106         Func const& func,
107         RestOfFuncs const&... rest)
108     {
109         if (node.type() == T::type_id())
110             func(node.as<T>());
111         else
112             do_for_types<RestOfT...>(node, rest...);
113     }
114
115     template <class T>
116     struct single_element_container
117     {
118         single_element_container(T& t) : elem(&t)
119         {}
120
121         constexpr size_t size() const { return 1; }
122         single_element_container<T> begin() const { return single_element_container(elem); }
123         single_element_container<T> end() const { return single_element_container(nullptr); }
124         single_element_container<T>& operator ++() { elem = nullptr; return *this; }
125         bool operator !=(single_element_container const& sec) { return elem != sec.elem; }
126
127         T operator *() { return *elem; }
128
129     private:
130         single_element_container(T* t) : elem(t)
131         {}
132
133         T* elem;
134     };
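    //Illustrative usage sketch (not part of the original sources): the wrapper acts as a
    //range that visits the wrapped element exactly once, which is what allows it to stand
    //in for std::vector in a range-based for loop:
    //  int x = 42;
    //  for (auto v : single_element_container<int>(x))
    //      assert(v == 42); //the loop body executes exactly once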
135
136     //helper function which creates a single-element container if it's given anything
137     //other than std::vector.
138     //It should be used in generic code when there's a need to force container usage
139     //in a for-each loop over a variable which can be a vector in one context and a
140     //scalar in another.
141     //example:
142     // T t;
143     // for (auto& string : wrap_if_single(t.dump()))
144     //depending on the type T, t.dump() may return either std::string or std::vector<std::string>;
145     //to ensure compatibility between these cases, wrap_if_single will create a single-element
146     //container in case t.dump() returns a plain std::string.
147     //
148     // T& case -> returns container which holds T&
149     template <class T>
150     single_element_container<T> wrap_if_single(T& t)
151     {
152         return single_element_container<T>(t);
153     }
154
155     //helper function which creates single-element array if it's given anything
156     //other than std::vector.
157     // T const& case -> returns container which holds T const&
158     template <class T>
159     single_element_container<T const> wrap_if_single(T const& t)
160     {
161         return single_element_container<T const>(t);
162     }
163
164     //helper function which creates single-element array if it's given anything
165     //other than std::vector.
166     // T&& case -> returns container which holds new instance of T created by moving given param
167     template <class T>
168     single_element_container<T> wrap_if_single(T&& t)
169     {
170         static_assert(meta::always_false<T>::value, "Wrapping temporary object into single_element_container is an error (requires valid reference)");
171         return single_element_container<T>(t);
172     }
173
174     //helper function which creates single-element array if it's given anything
175     //other than std::vector.
176     // std::vector case -> does not wrap, returns t as-is
177     primitive::fixed_size_vector_ref const& wrap_if_single(primitive::fixed_size_vector_ref const& t)
178     {
179         return t;
180     }
181
182     //helper function for merging the weights/biases buffers on the cpu side for the depthwise separable convolution optimization
183     void merge_buffers(engine_impl::ptr engine, program_node &node, layout target_layout, size_t begin_offset, size_t end_offset)
184     {
185         memory_impl::ptr data_to_allocate = engine->allocate_memory(target_layout);
186
187         for (size_t i = begin_offset; i < end_offset; i++)
188         {
189             auto& weights = node.get_dependency(i).as<data>();
190             mem_lock<char> src{ weights.get_attached_memory() };
191             mem_lock<char> dst{ data_to_allocate };
192             std::copy(src.begin(), src.end(), dst.begin() + (i - begin_offset)*src.size());
193         }
194
195         for (size_t i = 0; i < end_offset - begin_offset - 1; i++)
196             node.remove_dependency(begin_offset + 1);
197
198         auto& data_node = node.get_dependency(begin_offset).as<data>();
199         data_node.attach_memory(*data_to_allocate, false);
200     }
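    //Illustrative call sketch (hypothetical node and layout): merging three per-split weight
    //blobs held by dependencies 1..3 copies them back-to-back into one buffer of 'merged_layout',
    //detaches dependencies 2..3 and leaves dependency 1 owning the merged memory:
    //  merge_buffers(engine, conv_node, merged_layout, 1, 4);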
201
202     //helper function for getting target layout used in depthwise sep optimization
203     layout get_weights_layout(typed_program_node<cldnn::data> &data_node, int32_t split)
204     {
205         auto mem_layout = data_node.get_output_layout();
206
207         return layout(mem_layout.data_type, mem_layout.format, { split * mem_layout.size.batch[0], mem_layout.size.feature[0], mem_layout.size.spatial[0], mem_layout.size.spatial[1] });
208     }
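    //Illustrative example (hypothetical sizes): for split == 4 and per-slice weights of layout
    //{ b=8, f=1, spatial 3x3 }, the returned target layout is { b=32, f=1, spatial 3x3 },
    //i.e. all slices stacked along the batch dimension.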
209
210     // pair.first tells whether l1 and l2 are absolutely identical
211     // pair.second tells whether l1 and l2 can be reinterpreted to each other without the need for reordering
212     // note: layouts can only be considered identical if the data sizes described by both layouts match (so no data is generated nor dropped)
213     // note: if layouts describe two buffers of different sizes, consider them not to be identical even if the smaller buffer can be considered to hold a subsequence of the larger buffer;
214     //       this behavior is required to force buffer allocation for the smaller buffer which, currently, should always be performed
215     std::pair<bool, bool> are_layouts_identical(layout const& l1, layout const& l2)
216     {
217         if (l1 == l2)
218             return{ true, true };
219         if (l1.data_type != l2.data_type)
220             return{ false, false };
221         if (l1.size != l2.size)
222             return{ false, false };
223         if (l1.get_linear_size() != l2.get_linear_size())
224             return{ false, false };
225         if ((l1.format == format::bf8_xy16 && l2.format != format::bf8_xy16) ||
226             (l2.format == format::bf8_xy16 && l1.format != format::bf8_xy16))
227             return{ false, false };
228
229         auto l1_pitch = l1.get_pitches();
230         auto l2_pitch = l2.get_pitches();
231
232         //ignore pitches which will never be used (for dims with size == 1)
233         for (size_t i = 0; i < CLDNN_TENSOR_DIM_MAX; ++i)
234             if (l1.size.raw[i] == 1)
235                 l1_pitch.raw[i] = 0;
236         for (size_t i = 0; i < CLDNN_TENSOR_DIM_MAX; ++i)
237             if (l2.size.raw[i] == 1)
238                 l2_pitch.raw[i] = 0;
239
240         auto l1_offset = l1.get_linear_offset();
241         auto l2_offset = l2.get_linear_offset();
242         if (l1_pitch == l2_pitch && l1_offset == l2_offset)
243             return{ false, true };
244
245         return{ false, false };
246     }
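    //Illustrative example (assumed shapes): two layouts with the same data type and size
    //{ b=1, f=1, y=8, x=8 }, one in format::bfyx and one in format::yxfb, describe the same
    //linear buffer once pitches of size-1 dimensions are ignored, so the function returns
    //{ false, true }: not identical, but reinterpretable without a reorder.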
247 }
248
249 program_impl::program_impl(engine_impl& engine_ref, topology_impl const& topology, build_options const& options, bool is_internal)
250     : engine(&engine_ref), options(options), output_size_handling_enabled(true)
251 {
252     static std::atomic<uint32_t> id_gen{ 0 };
253     prog_id = ++id_gen;
254     assert(prog_id != 0);
255
256     if ((options.get<build_option_type::tuning_config>()->config.mode == tuning_mode::tuning_tune_and_cache) &&
257         !engine->configuration().enable_profiling)
258     {
259         throw std::invalid_argument("Engine must be created with profiling enabled in tune_and_cache mode!");
260     }
261
262     init_graph(topology);
263     pre_optimize_graph();
264     compile_graph();
265     post_optimize_graph();
266
267     engine->compile_program(*this);
268     this->dump_program("13_finished", true);
269
270     //Serializes the program under the given name.
271     //Placeholder, not working yet, in progress.
272     auto serialization_network_name = get_serialization_network_name(options);
273     if (!serialization_network_name.empty() && !is_internal)
274     {
275         this->serialize(serialization_network_name);
276     }
277
278     cleanup();
279 }
280
281 // TODO: Remove once we get full support for input/output padding in all primitive implementations.
282 void program_impl::analyze_output_size_handling_need()
283 {
284     bool handling_needed = false;
285
286     // Calculate output size and compare with specified.
287     for (const auto& node : processing_order)
288     {
289         if (node->is_type<convolution>())
290         {
291             auto& prim_node = node->as<convolution>();
292             const auto& prim = prim_node.get_primitive();
293
294             if (!prim->with_output_size)
295                 continue;
296
297             tensor specified_output_range({ 0, 0, prim->output_size.spatial[0], prim->output_size.spatial[1] }, 1);
298
299             auto filter_size = prim_node.weights(0).get_output_layout().size;
300
301             auto calc_output_range = calc_sliding_window_output_range<swor_mode::all>(
302                 prim_node.input().get_output_layout().size,
303                 filter_size, prim->input_offset, prim->stride, prim->dilation, true, 1);
304
305             if (specified_output_range != calc_output_range)
306                 handling_needed = true;
307         }
308         else if (node->is_type<deconvolution>())
309         {
310             auto& prim_node = node->as<deconvolution>();
311             const auto& prim = prim_node.get_primitive();
312
313             if (!prim->with_output_size)
314                 continue;
315
316             tensor specified_output_range({ 0, 0, prim->output_size.spatial[0], prim->output_size.spatial[1] }, 1);
317
318             auto filter_size = prim_node.weights(0).get_output_layout().size;
319
320             auto calc_output_range = calc_sliding_window_needed_input_range(
321                 prim_node.input().get_output_layout().size,
322                 filter_size, prim->input_offset, prim->stride, { 1, 1, 1, 1 }, true, 1);
323
324             if (specified_output_range != calc_output_range)
325                 handling_needed = true;
326         }
327         else if (node->is_type<pooling>())
328         {
329             auto& prim_node = node->as<pooling>();
330             const auto& prim = prim_node.get_primitive();
331
332             if (!prim->with_output_size)
333                 continue;
334
335             tensor specified_output_range({ 0, 0, prim->output_size.spatial[0], prim->output_size.spatial[1] }, 1);
336
337             // TODO: Check compatibility of output size calculation (with caffe).
338             auto calc_output_range = calc_sliding_window_output_range<swor_mode::exceed_once_data>(
339                 prim_node.input().get_output_layout().size,
340                 prim->size, prim->input_offset, prim->stride, { 1, 1, 1, 1 }, true, 1);
341
342             if (specified_output_range != calc_output_range)
343                 handling_needed = true;
344         }
345     }
346
347     output_size_handling_enabled = handling_needed;
348 }
349
350 std::list<std::shared_ptr<program_node>> program_impl::get_nodes() const
351 {
352     std::list<std::shared_ptr<program_node>> ret;
353
354     for (auto& node : processing_order)
355         ret.push_back(nodes_map.at(node->id()));
356     return ret;
357 }
358
359 void program_impl::init_graph(topology_impl const& topology)
360 {
361     auto const& topo_map = topology.get_primitives();
362     for (auto const& prim : topo_map)
363     {
364         auto& n = get_or_create(prim.second);
365         inputs.push_back(&n);
366     }
367     replace_nodes_pre();
368
369     for (auto itr = inputs.begin(); itr != inputs.end(); )
370     {
371         auto node_itr = itr++;
372         auto& node = (*node_itr);
373         auto deps = node->get_primitive()->dependencies();
374         if (deps.empty())
375             continue;
376
377         //add pointers to node's dependencies
378         for (auto& dep : deps)
379         {
380             try {
381                 auto dep_node = nodes_map.at(dep);
382                 node->dependencies.push_back(dep_node.get());
383                 dep_node->users.push_back(node);
384             }
385             catch (...) {
386                 throw std::runtime_error("Program doesn't contain primitive: " + dep +
387                     " that is input to: " + node->get_primitive()->id);
388             }
389         }
390
391         //primitive has dependencies so remove it from 'inputs'
392         inputs.erase(node_itr);
393     }
394
395     replace_nodes_post();
396     handle_lstm();
397     set_outputs();
398     calc_processing_order();
399
400     dump_program("0_init", true);
401
402     calc_prior_boxes(); dump_program("1_calculated_prior_boxes", true);
403     mark_constants();
404     mark_data_flow();
405     dump_program("2_analyzed_graph", true);
406 }
407
408 void program_impl::pre_optimize_graph()
409 {
410     trim_to_outputs(); dump_program("3_trimmed", true);
411     calculate_BFS_processing_order();
412     analyze_output_size_handling_need();
413     for (auto& node : processing_order)
414     {
415         if (!node->is_type<internal_primitive>() && !node->is_type<data>())
416             node->get_output_layout();
417     }
418
419     if (options.get<build_option_type::optimize_data>()->enabled())
420     {
421         prepare_primitive_fusing();
422         layout_optimizer lo(output_size_handling_enabled);
423         reorder_inputs(lo);
424         // this code should move to post-compilation once the kernel selector supports handling reorder bias
425         pre_optimize_bias(lo);
426         dump_program("4_reordered_inputs", true);
427     }
428
429     handle_reshape();
430     remove_redundant_reorders(); dump_program("5_removed_redundant_reorders", true);
431     prepare_padding();
432     prepare_depthwise_sep_opt();
433
434     propagate_constants(); dump_program("6_propagated_constants", true);
435
436     //try to fuse buffers (e.g. depth_concat in bfyx format) after padding calculations
437     if (options.get<build_option_type::optimize_data>()->enabled())
438     {
439         prepare_buffer_fusing();
440     }
441
442     dump_program("7_pre_optimized", true);
443 }
444
445 void program_impl::compile_graph()
446 {
447     for (auto& node : processing_order)
448     {
449         if (!node->is_type<internal_primitive>() && !node->is_type<data>())
450         {
451             node->get_output_layout();
452             if (!node->is_type<data>() && !(node->is_type<mutable_data>() && node->get_dependencies().empty()))
453                 node->selected_impl = node->type()->choose_impl(*engine, *node);
454         }
455     }
456
457     dump_program("8_compiled", true);
458 }
459
460 void program_impl::post_optimize_graph()
461 {
462     layout_optimizer lo;
463     post_optimize_weights(lo); dump_program("9_reordered_weights", true);
464     remove_redundant_reorders(); dump_program("10_removed_redundant_reorders", true); //TODO: do we also need it at this point?
465     propagate_constants(); dump_program("11_propagated_constants", true);
466     prep_opt_depthwise_sep_post();
467     update_processing_numbers(); dump_program("12_validated_processing_order", true);
468     prepare_memory_dependencies();
469 }
470
471 void program_impl::cleanup()
472 {
473     for (auto& node : processing_order)
474         if (!node->is_type<internal_primitive>())
475             node->get_output_layout();
476
477     //in a debug build, at the end, mark all nodes as outputs so the user can query for buffers of all not-optimized nodes, including internal ones etc.
478     if (is_debug_build())
479     {
480         for (auto& node : processing_order)
481         {
482             if (!node->is_output())
483             {
484                 node->set_output(true);
485                 outputs.push_back(node);
486             }
487         }
488     }
489 }
490
491 std::string get_id_string(size_t i) {
492     std::stringstream ss;
493     ss << std::setw(5) << std::setfill('0') << i;
494     return ss.str();
495 }
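//Example: get_id_string(7) yields "00007"; the fixed width keeps generated primitive ids sortable.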
496
497 void program_impl::replace_nodes_pre()
498 {
499     auto itr = nodes_map.begin();
500     while (itr != nodes_map.end())
501     {
502         auto node_itr = itr++;
503         auto& node = (*node_itr).second;
504
505         //find split primitives and create crop primitives out of them
506         if (node->is_type<split>())
507         {
508             auto split_prim = node->as<split>().typed_desc();
509             primitive_id input_id = split_prim->input[0];
510             auto split_num = split_prim->output_offsets.size();
511
512             //create crop for each split output provided
513             for (decltype(split_num) i = 0; i < split_num; i++)
514             {
515                 primitive_id output_id = node->id() + ":" + split_prim->output_ids[i];
516
517                 //create dummy crop primitive and add it to nodes map
518                 auto crop_prim = std::make_shared<crop>(output_id, input_id, tensor{ 1,1,1,1 }, split_prim->output_offsets[i]);
519                 get_or_create(crop_prim);
520             }
521         }
522     }
523 }
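//Illustrative example (hypothetical ids): a split node "sp" with output_ids { "a", "b" } gets two
//placeholder crop nodes "sp:a" and "sp:b" here; their reference_input sizes and connections are
//filled in later by replace_nodes_post.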
524
525
526 void program_impl::replace_nodes_post()
527 {
528     auto itr = nodes_map.begin(); //note we need to use iterators since currently processed element can be removed
529     while (itr != nodes_map.end())
530     {
531         auto node_itr = itr++;
532         auto& node = (*node_itr).second;
533
534         //find split primitives and create crop primitives out of them
535         if (node->is_type<split>())
536         {
537             //check that split is not used directly by any primitive, as it will be optimized out
538             if (node->get_users().size() != 0)
539                 throw std::logic_error("Split layer cannot be used directly! Please use split output \"" + node->id() + ":<split_output_id>\"!");
540
541             //get output size and validate split primitive inputs
542             auto output_layout = node->get_output_layout();
543             auto output_layout_size = output_layout.size;
544
545             auto split_prim = node->as<split>().typed_desc();
546             primitive_id input_id = split_prim->input[0];
547             auto split_num = split_prim->output_offsets.size();
548
549             //create crop for each split output provided
550             for (decltype(split_num) i = 0; i < split_num; i++)
551             {
552                 primitive_id output_id = node->id() + ":" + split_prim->output_ids[i];
553
554                 auto node_ptr = nodes_map.find(output_id)->second;
555
556                 //calculate crop reference input size
557                 tensor reference_input_size;
558
559                 for (decltype(split_num) j = 0; j < i; j++)
560                     reference_input_size += split_prim->output_offsets[j + 1] - split_prim->output_offsets[j];
561
562                 for (decltype(split_num) j = i; j < split_num - 1; j++)
563                     reference_input_size += split_prim->output_offsets[j + 1] - split_prim->output_offsets[j];
564
565                 reference_input_size = output_layout_size - reference_input_size;
566
567                 //update crop primitive and add connections
568                 node_ptr->set_output_padding(output_layout.data_padding);
569                 auto crop_prim = node_ptr->as<crop>().typed_desc();
570                 crop_prim->reference_input = reference_input_size;
571
572                 add_connection(node->get_dependency(0), *node_ptr);
573             }
574
575             //remove input->split connection and remove original split node
576             remove_connection(node->get_dependency(0), *node);
577             optimized_out.push_back(node->id());
578             nodes_map.erase(node->id());
579             continue;
580         }
581
582         //find upsampling primitives with bilinear filtering and create deconvolution with proper weights instead
583         if (node->is_type<upsampling>())
584         {
585             auto upsampling_prim = node->as<upsampling>().typed_desc();
586
587             if (upsampling_prim->sample_type != upsampling_sample_type::bilinear)
588                 continue;
589
590             //check if num_filter is not 0 (required for bilinear upsampling)
591             if (upsampling_prim->num_filter == 0)
592                 throw std::logic_error("num_filter in upsampling cannot be 0 in bilinear filtering mode in \"" + node->id() + "\"!");
593
594             primitive_id upsampling_id = node->id();
595             auto& input_node = node->get_dependency(0);
596
597             primitive_id input_id = upsampling_prim->input[0];
598             auto num_filter = upsampling_prim->num_filter;
599
600             //setting deconvolution parameters based on upsampling input
601             auto scale = static_cast<tensor::value_type>(upsampling_prim->scale);
602             tensor stride(1, 1, scale, scale);
603             auto offset = static_cast<tensor::value_type>(std::ceil((scale - 1) / 2.f));
604             tensor input_offset(0, 0, -offset, -offset);
605
606             //setting weights for deconvolution
607             auto kernel_size = static_cast<tensor::value_type>((2 * scale) - (scale % 2));
608             layout weights_layout(data_types::f32, format::bfyx, tensor(1, 1, kernel_size, kernel_size));
609
610             std::vector<primitive_id> weights_vec;
611             for (uint32_t weights_idx = 0; weights_idx < num_filter; weights_idx++)
612             {
613                 memory_impl::ptr data_to_allocate = engine->allocate_memory(weights_layout);
614                 mem_lock<float> dst{ data_to_allocate };
615                 float *dst_data = dst.data();
616                 //initialize with bilinear weights data
617                 auto f = static_cast<uint32_t>(std::ceil(kernel_size / 2.0f));
618                 float c = (2 * f - 1 - f % 2) / (2.f * f);
619                 float x = 0.f;
620                 float y = 0.f;
621                 for (size_t i = 0; i < weights_layout.count(); ++i) {
622                     x = static_cast<float>(i % kernel_size);
623                     y = static_cast<float>((i / kernel_size) % kernel_size);
624                     dst_data[i] = (1 - std::abs(x / f - c)) * (1 - std::abs(y / f - c));
625                 }
626
627                 //create weights primitive, with dummy memory which will be replaced in a further step
628                 primitive_id weights_id = upsampling_id + "_deconvolution_weights" + std::to_string(weights_idx);
629                 layout dummy_layout(data_types::f32, format::bfyx, tensor(1, 1, 1, 1));
630                 float zero = 0.f;
631                 auto weights_prim = std::make_shared<data>(weights_id, memory::attach(dummy_layout, &zero, 1));
632                 get_or_create(weights_prim);
633
634                 weights_vec.push_back(weights_id);
635
636                 auto weights_node_ptr = nodes_map.find(weights_id)->second;
637
638                 //attach weights buffer
639                 auto& data_node = weights_node_ptr->as<data>();
640                 data_node.attach_memory(*data_to_allocate, false);
641             }
642
643             //remove upsampling node, rename it and move to the optimized list
644             remove_connection(node->get_dependency(0), *node);
645             auto rename_id = upsampling_id + "_tmp";
646             rename(*node, rename_id);
647
648             //create deconvolution primitive
649             auto deconv_prim = std::make_shared<deconvolution>(upsampling_id, input_id, weights_vec, stride, input_offset);
650             get_or_create(deconv_prim);
651
652             auto deconv_node_ptr = nodes_map.find(upsampling_id)->second;
653
654             auto upsampling_node_ptr = nodes_map.find(rename_id)->second;
655             replace_all_usages(*upsampling_node_ptr, *deconv_node_ptr);
656             optimized_out.push_back(rename_id);
657             nodes_map.erase(rename_id);
658
659             //add connections input->deconvolution and weights->deconvolution
660             add_connection(input_node, *deconv_node_ptr);
661
662             for (uint32_t weights_idx = 0; weights_idx < num_filter; weights_idx++)
663             {
664                 auto weights_node_ptr = nodes_map.find(weights_vec[weights_idx])->second;
665                 add_connection(*weights_node_ptr, *deconv_node_ptr);
666             }
667             continue;
668         }
669
670         //find deconvolution primitives with stride 1 and change them to convolution with transposed weights
671         if (node->is_type<deconvolution>())
672         {
673             if (!options.get<build_option_type::optimize_data>()->enabled())
674                 continue;
675
676             auto deconv_prim = node->as<deconvolution>().typed_desc();
677
678             //limit optimization to stride = 1
679             if (deconv_prim->stride.spatial[0] != 1 || deconv_prim->stride.spatial[1] != 1 || deconv_prim->gradient())
680                 continue;
681
682             primitive_id deconv_id = node->id();
683             auto& input_node = node->get_dependency(0);
684
685             primitive_id input_id = deconv_prim->input[0];
686
687             //setting convolution parameters based on deconvolution params
688             auto stride = deconv_prim->stride;
689             auto weights = deconv_prim->weights;
690             std::vector<primitive_id> weights_vec;
691             for (auto& weights_id : weights)
692                 weights_vec.push_back(weights_id);
693             auto biases = deconv_prim->bias;
694             std::vector<primitive_id> bias_vec;
695             for (auto& bias_id : biases)
696                 bias_vec.push_back(bias_id);
697             auto input_offset = deconv_prim->input_offset;
698             auto with_activation = deconv_prim->with_activation;
699             auto activation_negative_slope = deconv_prim->activation_negative_slope;
700             auto output_padding = deconv_prim->output_padding;
701
702             //remove deconvolution node and its connections to weights and biases, rename it and move to the optimized list
703             tensor filter_size = { 1, 1, 1, 1 };
704             remove_connection(node->get_dependency(0), *node);
705             for (auto& weights_id : weights_vec)
706             {
707                 auto weights_node_ptr = nodes_map.find(weights_id)->second;
708                 remove_connection(*weights_node_ptr, *node);
709                 //get filter spatial sizes for input offset adjustment, perform this only once as all filters should have the same size
710                 if (weights_id == weights_vec[0])
711                     filter_size = weights_node_ptr->get_output_layout().size;
712             }
713
714             input_offset.spatial[0] = std::abs(input_offset.spatial[0]) - (filter_size.spatial[0] - 1);
715             input_offset.spatial[1] = std::abs(input_offset.spatial[1]) - (filter_size.spatial[1] - 1);
716
717             if (!bias_vec.empty())
718             {
719                 for (auto& bias_id : bias_vec)
720                 {
721                     auto bias_id_node_ptr = nodes_map.find(bias_id)->second;
722                     remove_connection(*bias_id_node_ptr, *node);
723                 }
724             }
725             auto rename_id = deconv_id + "_tmp";
726             rename(*node, rename_id);
727
728             //create convolution primitive
729             if (biases.size() != 0)
730             {
731                 auto conv_prim = std::make_shared<convolution>(deconv_id, input_id, weights_vec, bias_vec,
732                     stride, input_offset, tensor{ 1, 1, 1, 1 }, with_activation, activation_negative_slope, output_padding);
733                 get_or_create(conv_prim);
734             }
735             else
736             {
737                 auto conv_prim = std::make_shared<convolution>(deconv_id, input_id, weights_vec,
738                     stride, input_offset, tensor{ 1, 1, 1, 1 }, with_activation, activation_negative_slope, output_padding);
739                 get_or_create(conv_prim);
740             }
741
742             auto conv_node_ptr = nodes_map.find(deconv_id)->second;
743             auto conv_node = &conv_node_ptr->as<convolution>();
744             conv_node->set_transposed(true);
745
746             //add connections input->convolution, weights->convolution and bias->convolution
747             add_connection(input_node, *conv_node_ptr);
748
749             for (auto& weights_id : weights_vec)
750             {
751                 auto weights_node_ptr = nodes_map.find(weights_id)->second;
752                 add_connection(*weights_node_ptr, *conv_node_ptr);
753             }
754
755             if (!bias_vec.empty())
756             {
757                 for (auto& bias_id : bias_vec)
758                 {
759                     auto bias_id_node_ptr = nodes_map.find(bias_id)->second;
760                     add_connection(*bias_id_node_ptr, *conv_node_ptr);
761                 }
762             }
763
764             auto deconv_node_ptr = nodes_map.find(rename_id)->second;
765             replace_all_usages(*deconv_node_ptr, *conv_node_ptr);
766             optimized_out.push_back(rename_id);
767             nodes_map.erase(rename_id);
768
769             continue;
770         }
771     }
772 }
773
774 void program_impl::handle_lstm()
775 {
776     bool has_lstm_children;
777     auto itr = nodes_map.begin(); //note we need to use iterators since currently processed element can be removed
778     while (itr != nodes_map.end())
779     {
780         auto node_itr = itr++;
781         auto& node = (*node_itr).second;
782         has_lstm_children = false;
783         // replace lstm node with lstm_gemm and lstm_elt nodes
784         if (node->is_type<lstm>()) {
785             bool initial_hidden_term = node->as<lstm>().initial_hidden_term();
786             bool initial_cell_term = node->as<lstm>().initial_cell_term();
787             bool bias_term = node->as<lstm>().bias_term();
788             auto lstm_prim = node->as<lstm>().typed_desc();
789             primitive_id weights_id = lstm_prim->weights;
790             primitive_id recurrent_id = lstm_prim->recurrent;
791             primitive_id bias_id = bias_term ? lstm_prim->bias : "";
792             primitive_id initial_hidden_id = initial_hidden_term ? lstm_prim->initial_hidden : "";
793             primitive_id initial_cell_id = initial_cell_term ? lstm_prim->initial_cell : "";
794             //removing connection with weights to get proper dependency order for next operations
795             remove_connection(*nodes_map.at(weights_id), *node);
796             remove_connection(*nodes_map.at(recurrent_id), *node);
797             if (bias_term)
798                 remove_connection(*nodes_map.at(bias_id), *node);
799             if (initial_hidden_term)
800                 remove_connection(*nodes_map.at(initial_hidden_id), *node);
801             if (initial_cell_term)
802                 remove_connection(*nodes_map.at(initial_cell_id), *node);
803
804             //calculating sizes
805             auto input_size = node->get_dependency(0).get_output_layout().size;
806             auto recurrent_size = nodes_map.at(recurrent_id)->get_output_layout().size;
807             auto hidden_size = tensor(input_size.batch[0], 1, recurrent_size.spatial[0], input_size.feature[0]);
808             size_t directions = recurrent_size.feature[0];
809             size_t input_dependencies = node->get_dependencies().size();
810             size_t sequence_len = node->as<lstm>().sequence_len();
811
812             //if the sequence has a single element but it has multiple inputs then
813             //the parent of this lstm is an lstm node. If this is a bidirectional lstm
814             //then the sequence length is the number of dependencies divided by 2.
815             if (sequence_len == 1 && input_dependencies > 1)
816                 sequence_len = (directions == 1) ? input_dependencies : input_dependencies / 2;
817
818             //check if this lstm node has an lstm child
819             for (auto& user : node->get_users())
820             {
821                 if (user->is_type<lstm>())
822                 {
823                     has_lstm_children = true;
824                 }
825             }
826
827             std::vector<program_node*> cell_list(directions * sequence_len);
828             std::vector<program_node*> concat_depends(directions * sequence_len);
829             std::vector<primitive_id> output_ids_offsets(directions * sequence_len);
830
831             primitive_id hidden_fwd_id = initial_hidden_id;
832             primitive_id hidden_bwd_id = initial_hidden_id;
833             primitive_id cell_fwd_id = initial_cell_id;
834             primitive_id cell_bwd_id = initial_cell_id;
835
836             auto split_direction = [&](const std::string gate, bool initial_term, primitive_id& fwd_id, primitive_id& bwd_id) {
837                 if (initial_term) {
838                     primitive_id initial_id = fwd_id;
839                     fwd_id = node->id() + ":" + gate + "_fwd";
840                     auto fwd_node = std::make_shared<crop>(fwd_id, initial_id, hidden_size, tensor{ 0,0,0,0 });
841                     auto &n1 = get_or_create(fwd_node);
842                     add_connection(*nodes_map.at(initial_id), n1);
843                     bwd_id = node->id() + ":" + gate + "_bwd";
844                     auto bwd_node = std::make_shared<crop>(bwd_id, initial_id, hidden_size, tensor{ 0,1,0,0 });
845                     auto &n2 = get_or_create(bwd_node);
846                     add_connection(*nodes_map.at(initial_id), n2);
847                 }
848             };
849
850             //if bidirectional lstm then initial_hidden and initial_cell terms need to be split
851             if (directions > 1) {
852                 split_direction("hidden", initial_hidden_term, hidden_fwd_id, hidden_bwd_id);
853                 split_direction("cell", initial_cell_term, cell_fwd_id, cell_bwd_id);
854             }
855
856             //lstm expanding
857             for (size_t dir = 0; dir < directions; ++dir) {
858                 auto hidden_id = dir == 0 ? hidden_fwd_id : hidden_bwd_id;
859                 auto cell_id = dir == 0 ? cell_fwd_id : cell_bwd_id;
860                 for (size_t i = 0; i < sequence_len; ++i) {
861                     size_t idx = i + dir * sequence_len;
862                     primitive_id lstm_gemm_id = node->id() + ":lstm_gemm" + get_id_string(idx);
863                     primitive_id lstm_elt_id = node->id() + ":lstm_elt" + get_id_string(idx);
864                     primitive_id crop_id = node->id() + ":crop" + get_id_string(idx);
865
866                     size_t input_idx = i;
867                     //for bidirectional lstms, the backward direction of the first LSTM layer reads the input in reverse;
868                     //for subsequent stacked layers the input is strided on the dir dimension
869                     if (directions > 0) {
870                         if (input_dependencies > sequence_len) { // stacked layer
871                             input_idx = dir * sequence_len + i;
872                         }
873                         else
874                         {
875                             if (dir > 0) { // backward direction of the first layer
876                                 input_idx = sequence_len - i - 1;
877                             }
878                         }
879                     }
880                     primitive_id lstm_gemm_input_id = node->get_dependency(input_idx).get_org_primitive_id();
881
882                     auto lstm_gemm_node = std::make_shared<lstm_gemm>(lstm_gemm_id, lstm_gemm_input_id, weights_id, recurrent_id, bias_id, hidden_id, (uint32_t)dir);
883                     auto &n1 = get_or_create(lstm_gemm_node);
884
885                     auto lstm_elt_node = std::make_shared<lstm_elt>(lstm_elt_id, lstm_gemm_id, cell_id, lstm_prim->clip, lstm_prim->input_forget,
886                         lstm_prim->activations, lstm_prim->activation_params, lstm_prim->offset_order);
887                     auto &n2 = get_or_create(lstm_elt_node);
888                     //adding lstm_elt as user
889                     add_connection(n1, n2);
890                     //adding dependency to lstm_gemm node
891                     //input
892                     add_connection(node->get_dependency(input_idx), n1);
893                     //adding weights and initial values to lstm_gemm
894                     add_connection(*nodes_map.at(weights_id), n1);
895                     add_connection(*nodes_map.at(recurrent_id), n1);
896                     if (bias_term)
897                         add_connection(*nodes_map.at(bias_id), n1);
898
899                     //adding cell and hiddens as dependencies
900                     if (i > 0)
901                     {
902                         add_connection(*cell_list[size_t(i - 1) * directions + dir], n2);
903                         add_connection(*(concat_depends[size_t(i - 1) * directions + dir]), n1);
904                     }
905                     //if initial values are present
906                     else
907                     {
908                         if (initial_hidden_term)
909                             add_connection(*nodes_map.at(hidden_id), n1);
910                         if (initial_cell_term)
911                             add_connection(*nodes_map.at(cell_id), n2);
912                     }
913
914                     //lstm_hidden
915                     hidden_id = crop_id + ":hidden";
916                     auto crop_hidden = std::make_shared<crop>(hidden_id, lstm_elt_id, hidden_size, tensor{ 0,0,0,0 });
917                     auto &n3 = get_or_create(crop_hidden);
918                     //adding eltwise as dependency to hidden
919                     add_connection(n2, n3);
920
921                     //if parent is lstm adding hiddens as dependency
922                     if (has_lstm_children)
923                     {
924                         for (auto& user : node->get_users())
925                         {
926                             add_connection(n3, *user);
927                         }
928                     }
929                     concat_depends[i * directions + dir] = &n3;
930
931                     //lstm_cell
932                     if (i < sequence_len - 1) {
933                         cell_id = crop_id + ":cell";
934                         auto crop_cell = std::make_shared<crop>(cell_id, lstm_elt_id, hidden_size, tensor{ 0,1,0,0 });
935                         auto &n4 = get_or_create(crop_cell);
936                         add_connection(n2, n4);
937                         cell_list[i * directions + dir] = &n4;
938                     }
939                     output_ids_offsets[i * directions + dir] = hidden_id;
940                 }
941             }
942
943             //if there is no next lstm, concatenation is created
944             if (!has_lstm_children)
945             {
946                 primitive_id original_id = node->id();
947                 primitive_id concatenation_id = original_id + ":concat";
948                 auto concatenation_primitive = std::make_shared<concatenation>(concatenation_id, output_ids_offsets, concatenation::along_f);
949                 auto &concatenation_node = get_or_create(concatenation_primitive);
950                 for (auto sub_dependency : concat_depends)
951                 {
952                     add_connection(*sub_dependency, concatenation_node);
953                 }
954                 if (directions == 2) {
955                     // bidirectional support requires concatenations along the direction and sequence axes;
956                     // instead we can concatenate along the sequence axis and reshape the tensor to account
957                     // for the direction
958                     tensor output_size {input_size.batch[0], (int32_t)sequence_len, hidden_size.spatial[0], (int32_t)directions};
959                     primitive_id reshape_id = original_id + ":reshape";
960                     auto reshape_primitive = std::make_shared<reshape>(reshape_id, concatenation_id, output_size);
961                     auto &reshape_node = get_or_create(reshape_primitive);
962                     add_connection(concatenation_node, reshape_node);
963                     for (auto& user : node->get_users())
964                     {
965                         add_connection(reshape_node, *user);
966                     }
967                 }
968             }
969
970             //removing expanded node
971             remove_all_connections(*node);
972             nodes_map.erase(node->id());
973             continue;
974         }
975     }
976
977 }
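//Illustrative expansion (hypothetical id): a unidirectional lstm node "rnn" with sequence_len == 2
//and no lstm user is replaced by the chains
//  rnn:lstm_gemm00000 -> rnn:lstm_elt00000 -> rnn:crop00000:hidden
//  rnn:lstm_gemm00001 -> rnn:lstm_elt00001 -> rnn:crop00001:hidden
//followed by "rnn:concat", which concatenates the hidden crops along the feature axis.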
978
979 void program_impl::set_outputs()
980 {
981     auto outputs_option = options.get<build_option_type::outputs>();
982     if (!outputs_option->outputs.empty())
983     {
984         for (auto const& output : outputs_option->outputs)
985         {
986             auto o_node = nodes_map.at(output);
987             o_node->set_output(true);
988             outputs.push_back(o_node.get());
989         }
990     }
991     else
992     {
993         for (auto& node : nodes_map)
994             if (node.second->is_endpoint())
995             {
996                 node.second->set_output(true);
997                 outputs.push_back(node.second.get());
998             }
999     }
1000 }
1001
1002 void program_impl::calc_processing_order()
1003 {
1004     processing_order.clear();
1005
1006     //run dfs to sort nodes topologically
1007     for (auto input : inputs)
1008     {
1009         if (input->is_marked())
1010             continue;
1011
1012         input->mark();
1013         std::list<std::pair<program_node*, std::list<program_node*>::const_iterator>> stack = { std::make_pair(input, input->users.begin()) };
1014
1015         while (!stack.empty()) //imitate call stack
1016         {
1017         new_frame:
1018             auto& frame = stack.back();
1019
1020             while (frame.second != frame.first->users.end())
1021             {
1022                 auto successor = *frame.second;
1023                 ++frame.second;
1024
1025                 if (!successor->is_marked())
1026                 {
1027                     successor->mark();
1028
1029                     //recursive call
1030                     stack.push_back(std::make_pair(successor, successor->users.begin()));
1031                     goto new_frame;
1032                 }
1033             }
1034
1035             //we have finished processing one node so add it to the processing queue
1036             processing_order.push_front(frame.first);
1037             frame.first->processing_itr = processing_order.begin();
1038
1039             //return from call
1040             stack.pop_back();
1041         }
1042     }
1043
1044     uint32_t idx = 0;
1045     for (auto& node : processing_order)
1046     {
1047         node->processing_num = ++idx;
1048         node->unmark();
1049     }
1050 }
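//Illustrative example: for a diamond-shaped graph
//  input -> A -> C and input -> B -> C
//the DFS above pushes each finished node to the front of the list, producing a reverse post-order
//such as { input, B, A, C }; any order in which every node precedes all of its users is valid.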
1051
1052 void program_impl::update_processing_numbers()
1053 {
1054     uint32_t idx = 0;
1055     for (auto& node : processing_order)
1056     {
1057         node->processing_num = ++idx;
1058     }
1059
1060     for (auto& node : processing_order)
1061     {
1062         if (!processing_order_is_correct(node))
1063         {
1064             CLDNN_ERROR_MESSAGE(node->id(), "Incorrect processing order");
1065             return;
1066         }
1067     }
1068 }
1069
1070 void program_impl::calc_prior_boxes()
1071 {
1072     auto itr = processing_order.begin();
1073     while (itr != processing_order.end())
1074     {
1075         auto& node = (*itr++);
1076         if (!node->is_type<prior_box>())
1077             continue;
1078
1079         auto& pb_node = node->as<prior_box>();
1080
1081         pb_node.calc_result();
1082         remove_connection(pb_node.input(), pb_node);
1083
1084         auto& result = pb_node.get_result_buffer();
1085         result.add_ref(); // need to inc the ref count since in the next line we will assign this memory as cldnn_memory, which is not a ref_count_obj
1086         auto cpp_mem = details::memory_c_to_cpp_converter::convert(api_cast(&result));
1087
1088         auto& data_node = get_or_create(std::make_shared<data>("_cldnn_tmp_" + pb_node.id() + "_result", cpp_mem));
1089         replace(pb_node, data_node, false, false);
1090     }
1091 }
1092
1093
1094
1095 void program_impl::mark_constants()
1096 {
1097     for (auto& node : processing_order)
1098     {
1099         if (node->dependencies.empty())
1100             continue;
1101         if (node->is_type<prior_box>())
1102             continue;
1103
1104         node->constant = true;
1105         for (auto& dep : node->get_dependencies())
1106         {
1107             if (!dep->constant)
1108             {
1109                 node->constant = false;
1110                 break;
1111             }
1112         }
1113
1114         if (!node->constant)
1115             for (auto& dep : node->get_dependencies())
1116                 if (dep->constant)
1117                     dep->constant_frontier = true;
1118     }
1119 }
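//Illustrative example (assuming data nodes start out marked as constant): in the chain
//  data -> reorder -> convolution <- input_layout
//the reorder becomes constant (all its dependencies are constant), the convolution does not
//(one dependency is a non-constant input), and the reorder is flagged as constant_frontier,
//i.e. the boundary at which constant values can later be materialized by propagate_constants.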
1120
1121 void program_impl::mark_data_flow()
1122 {
1123     std::list<program_node*> stack;
1124     for (auto const& node : processing_order)
1125     {
1126         if ((node->is_endpoint() && !node->constant) || node->is_type<mutable_data>())
1127         {
1128             stack.push_back(node);
1129             node->data_flow = true;
1130             node->mark();
1131         }
1132     }
1133
1134     while (!stack.empty())
1135     {
1136         auto node = stack.front();
1137         stack.pop_front();
1138
1139         size_t dep_idx = 0;
1140         size_t inputs_count = (node->is_type<internal_primitive>() ? node->get_dependencies().size() : node->get_primitive()->input.size());
1141         //TODO: remove this hack after the addition of a constants propagation pass
1142         if (node->is_type<detection_output>() || node->is_type<proposal>())
1143             inputs_count = 2; //ignore third input as it is related to prior boxes (i.e. concat of prior-boxes)
1144
1145         for (auto dep : node->get_dependencies())
1146         {
1147             bool data_flow = (dep_idx < inputs_count && !dep->constant);
1148             ++dep_idx;
1149             if (!data_flow)
1150                 continue;
1151
1152             dep->data_flow = data_flow;
1153
1154             if (dep->is_marked())
1155                 continue;
1156
1157             stack.push_back(dep);
1158             dep->mark();
1159         }
1160     }
1161
1162     for (auto& node : processing_order)
1163     {
1164         assert(!node->constant || !node->data_flow); //node which is constant cannot be marked as data flow
1165         node->unmark();
1166     }
1167 }
1168
1169 void program_impl::trim_to_outputs()
1170 {
1171     size_t actual_nodes = processing_order.size();
1172     if (!actual_nodes) //degenerate case, but can happen
1173         return;
1174
1175     if (outputs.size() == actual_nodes)
1176         return;
1177
1178     //do backward bfs starting from all outputs
1179     std::list<const std::vector<program_node*>*> stack = { &outputs };
1180     while (!stack.empty())
1181     {
1182         auto nodes_list = stack.front();
1183         stack.pop_front();
1184
1185         for (auto node : *nodes_list)
1186         {
1187             if (!node->is_marked())
1188             {
1189                 node->mark();
1190                 if (!node->get_dependencies().empty())
1191                     stack.push_back(&node->get_dependencies());
1192             }
1193         }
1194     }
1195
1196     //all not-marked nodes should be removed
1197     std::list<program_node*> to_rem;
1198     for (auto node : processing_order)
1199     {
1200         if (node->is_type<input_layout>()) //input layout may become disconnected during prior-box calculations, so it may not have been marked at this point, but we don't want to remove it
1201             node->mark();
1202         else if (!node->is_marked())
1203             to_rem.push_back(node);
1204     }
1205
1206     for (auto const& node : to_rem)
1207     {
1208         if (node->is_input())
1209             inputs.remove(node);
1210         else
1211         {
1212             for (auto dep : node->dependencies)
1213                 if (dep->is_marked())
1214                     dep->users.remove(node);
1215         }
1216
1217         for (auto user : node->users)
1218             if (user->is_marked())
1219                 user->dependencies.erase(std::remove(user->dependencies.begin(), user->dependencies.end(), node), user->dependencies.end());
1220
1221         optimized_out.push_back(node->id());
1222         nodes_map.erase(node->id());
1223     }
1224 }
1225
1226 void add_memory_dependency(program_node* node, program_node* dep)
1227 {
1228     if (node->can_be_optimized() ||
1229         !dep->can_be_optimized())
1230     {
1231         node->add_memory_dependency(dep->id());
1232     }
1233     else
1234     {
1235         if (node->id() == dep->id())
1236         {
1237             return;
1238         }
1239         for (auto subdep : dep->get_dependencies())
1240         {
1241             add_memory_dependency(node, subdep);
1242             add_memory_dependency(subdep, node);
1243         }
1244     }
1245 }
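//Illustrative example: if 'dep' can be optimized out (e.g. an in-place reorder R over input I)
//while 'node' cannot, the restriction is pushed through R recursively, so 'node' and I land on
//each other's restriction lists instead of 'node' and R.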
1246
1247 void program_impl::basic_memory_dependencies()
1248 {
1249     auto itr = processing_order.begin();
1250     std::vector<primitive_id> past_outputs;
1251     while (itr != processing_order.end())
1252     {
1253         auto& node = *itr;
1254         itr++;
1255
1256         //data primitive can't be reused
1257         if (node->is_type<data>())
1258             continue;
1259
1260         // add my dependencies to the restriction list (can't share input/output buffers)
1261         for (auto it : node->get_dependencies())
1262         {
1263             add_memory_dependency(node, it);
1264             add_memory_dependency(it, node);
1265         }
1266
1267         // Note we iterate over the processing order; it means that if a primitive has a processing num greater than that of any output, this output
1268         // has to land on the primitive's restriction list. Otherwise memory reuse can corrupt the final results.
1269         node->add_memory_dependency(past_outputs);
1270         // if current node is an output add it to the outputs list after restriction.
1271         if (node->is_output())
1272             past_outputs.push_back(node->id());
1273     }
1274 }
1275
1276 void program_impl::skipped_branch_memory_dependencies()
1277 {
1278     auto itr = processing_order.begin();
1279     // Primitive A can't use primitive B's buffer if B->processing_num < A->processing_num and any of B's users has processing_num > A->processing_num
1280     // Otherwise it could overwrite data that has to be used in the future.
1281     // TODO: improve algorithm to O(n*log(n))
1282     while (itr != processing_order.end())
1283     {
1284         auto& node = *itr;
1285         itr++;
1286         auto itr2 = processing_order.begin();
1287         if (itr2 == itr)
1288             continue;
1289         while (itr2 != processing_order.end())
1290         {
1291             auto& node2 = *itr2;
1292             itr2++;
1293             if (node2->get_processing_num() < node->get_processing_num())
1294             {
1295                 // if at least one user will be processed after 'node', node2 has to be added to the forbidden list
1296                 for (auto usr : node2->get_users())
1297                 {
1298                     if (usr->get_processing_num() > node->get_processing_num())
1299                     {
1300                         add_memory_dependency(node, node2);
1301                         add_memory_dependency(node2, node);
1302                         break;
1303                     }
1304                 }
1305             }
1306         }
1307     }
1308 }
1309
1310 void program_impl::oooq_memory_dependencies()
1311 {
1312     auto itr = processing_order.begin();
1313     // This order lets us build dependencies based on syncing points.
1314     // The set of nodes between two syncing points will be called a sync_region.
1315     // The major rule is: a node can't share a resource with the nodes in its sync_region.
1316
1317     uint32_t last_barrier = 0;
1318     bool needs_barrier = false;
1319     std::vector<cldnn::program_node*> sync_region;
1320     while (itr != processing_order.end())
1321     {
1322         auto& node = *itr;
1323         itr++;
1324
1325         // if any dep has a processing num at or after the last barrier -> needs a barrier
1326         for (auto dep : node->get_dependencies())
1327         {
1328             if (dep->get_processing_num() >= last_barrier)
1329             {
1330                 needs_barrier = true;
1331                 break;
1332             }
1333         }
1334
1335         if (needs_barrier)
1336         {
1337             last_barrier = node->get_processing_num();
1338             needs_barrier = false;
1339             // add a bi-directional dependency for each pair of nodes in the sync region
1340             for (auto nd1 = sync_region.begin(); nd1 != sync_region.end(); nd1++)
1341             {
1342                 for (auto nd2 = nd1 + 1; nd2 != sync_region.end(); nd2++)
1343                 {
1344                     add_memory_dependency(*nd1, *nd2);
1345                     add_memory_dependency(*nd2, *nd1);
1346                 }
1347             }
1348
1349             // collect dependencies of every node in sync region
1350             std::vector<cldnn::program_node*> deps;
1351             for (auto& nd_in_region : sync_region)
1352                 for (auto& dep : nd_in_region->get_dependencies())
1353                     deps.emplace_back(dep);
1354
1355
1356             for (auto& nd_in_region : sync_region)
1357                 for (auto& dep : deps)
1358                 {
1359                     add_memory_dependency(nd_in_region, dep);
1360                     add_memory_dependency(dep, nd_in_region);
1361                 }
1362
1363             sync_region.clear();
1364         }
1365         sync_region.push_back(node);
1366     }
1367 }
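// Illustration: on an out-of-order queue all nodes between two barriers may execute
// concurrently, so the loops above forbid buffer sharing between any two members of a
// sync_region as well as between a member and any dependency feeding the region.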
1368
1369 void program_impl::prepare_memory_dependencies()
1370 {
1371     if (!get_engine().configuration().enable_memory_pool)
1372         return;
1373
1374     basic_memory_dependencies();
1375     skipped_branch_memory_dependencies();
1376     oooq_memory_dependencies();
1377 }
1378
1379 std::string program_impl::get_memory_dependencies_string() const
1380 {
1381     std::string mem_dep = "Memory dependencies/restrictions:\n";
1382     auto itr = processing_order.begin();
1383     while (itr != processing_order.end())
1384     {
1385         auto& node = *itr;
1386         itr++;
1387         mem_dep = mem_dep.append("primitive: ").append(node->id()).append(" restricted list: ");
1388         for (auto it : node->get_memory_dependencies())
1389             mem_dep = mem_dep.append(it).append(", ");
1390         mem_dep = mem_dep.append("\n");
1391     }
1392     return mem_dep;
1393 }
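// Example output (illustrative only):
//   Memory dependencies/restrictions:
//   primitive: conv1 restricted list: input, relu1,
//   primitive: relu1 restricted list: conv1,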
1394
1395 void program_impl::remove_redundant_reorders()
1396 {
1397     auto itr = processing_order.begin(); //note we need to use iterators since currently processed element can be removed
1398     while (itr != processing_order.end())
1399     {
1400         auto& node = (*itr++); //post-inc to avoid invalidation due to possible erase
1401         if (!node->is_type<reorder>()) //only care for reorders
1402             continue;
1403
1404         program_node* current_node = node;
1405         std::vector<program_node*> r_nodes_to_remove;
1406
1407         auto optimize = true;
1408         while (current_node)
1409         {
1410             auto& r_node = current_node->as<reorder>();
1411             current_node = nullptr;
1412
1413             if (r_node.has_mean() || !r_node.get_primitive()->subtract_per_feature.empty() ||  //do not optimize if mean or subtract_per_feature is present
1414                 (r_node.is_output() && r_node.get_dependency(0).is_output())) //do not optimize when both reorder and layer before are outputs
1415             {
1416                 optimize = false;
1417                 break;
1418             }
1419
1420             r_nodes_to_remove.push_back(&r_node);
1421
1422             if (r_node.get_dependency(0).is_type<reorder>() && r_node.get_dependencies().size() == 1 && r_node.get_users().size() == 1 && r_node.get_dependency(0).get_users().size() == 1)
1423                 current_node = &r_node.get_dependency(0);
1424         }
1425         if (!optimize)
1426             continue;
1427
1428         assert(node->dependencies.size() == 1 && "reorder without mean should have exactly one dependency (input)");
1429         auto& r_output = r_nodes_to_remove.front();
1430         auto& r_input = r_nodes_to_remove.back()->get_dependency(0);
1431         auto o_layout = r_output->get_output_layout();
1432         auto i_layout = r_input.get_output_layout();
1433
1434         auto ident = are_layouts_identical(o_layout, i_layout);
1435         if (!ident.second)
1436             continue;
1437
1438         for (auto remove_reorder_node : r_nodes_to_remove)
1439         {
1440             auto& r_node = remove_reorder_node->as<reorder>();
1441
1442             if (ident.first && ident.second && r_node.is_output() && r_node.get_dependency(0).is_input()) //do not optimize when reorder is output and layer before is input
1443             {
1444                 optimize = false;
1445                 break;
1446             }
1447         }
1448         if (!optimize)
1449             continue;
1450
1451         for (auto remove_reorder_node : r_nodes_to_remove)
1452         {
1453             auto& r_node = remove_reorder_node->as<reorder>();
1454
1455             //mark as optimized
1456             r_node.can_be_optimized(true);
1457             r_node.requires_reinterpret(!ident.first);
1458             if (ident.first) //no need for a reshape
1459                 extract_and_remove(r_node); //try to remove if possible (with respect to r_node not being marked as output)
1460         }
1461     }
1462 }
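// Illustration (hypothetical ids): a chain input(bfyx) -> reorder1(yxfb) -> reorder2(bfyx)
// whose endpoint layouts are identical gets both reorders extracted and removed, so users of
// reorder2 read directly from 'input'; if the layouts only reinterpret the same buffer, the
// reorders are kept but marked as requiring a reinterpret instead of a copy.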
1463
1464 /*
1465     recalculate processing_order
1466     algorithm based on: CLRS 24.5 (critical path in DAG)
1467     modifications: adjust for multiple inputs
1468     input: any topological order in processing order
1469     output: BFS topological order.
1470 */
1471
1472 void program_impl::calculate_BFS_processing_order() {
1473     std::map<program_node*, int> distances;
1474     for (auto itr : processing_order)
1475     {
1476         distances[itr] = -1;
1477     }
1478     int max_distance = 0;
1479     for (auto itr : processing_order)
1480     {
1481         //Init
1482         if (distances[itr] == -1) {     // this must be an input
1483             distances[itr] = 0;         // initialize input
1484         }
1485         // RELAX
1486         for (auto& user : itr->get_users())
1487         {
1488             distances[user] = std::max(distances[user], distances[itr] + 1);
1489             max_distance = std::max(max_distance, distances[user]);
1490         }
1491     }
1492
1493     //bucket sort nodes based on their max distance from input
1494     std::vector<std::vector<program_node*>> dist_lists;
1495     dist_lists.resize(max_distance + 1);
1496     for (auto itr : processing_order)
1497     {
1498         dist_lists[distances[itr]].push_back(itr);
1499     }
1500
1501     //replace the old processing order by the new one, still topological.
1502     processing_order.clear();
1503     for (auto& dist : dist_lists)
1504     {
1505         for (auto& node : dist)
1506         {
1507             processing_order.push_back(node);
1508             node->processing_itr = processing_order.end();
1509             node->processing_itr--;
1510         }
1511     }
1512     update_processing_numbers();
1513     return;
1514 }
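// Worked example (hypothetical graph): for edges input->conv, input->pool, conv->relu and
// pool->relu, the relaxation above yields distances input:0, conv:1, pool:1, relu:2; bucket
// sorting by distance gives the order [input, conv, pool, relu], which remains topological
// because every edge goes from a smaller to a strictly larger distance.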
1515
1516 void program_impl::reorder_inputs(layout_optimizer& lo)
1517 {
1518     //first pass to set layout optimization_attributes for topology
1519     for (auto& p : nodes_map)
1520     {
1521         auto& prim = *p.second;
1522         if (prim.type() == cldnn::convolution::type_id())
1523         {
1524             if (prim.as<convolution>().get_primitive()->split() > 1)
1525                 lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::splitted_convolution, 1);
1526         }
1527
1528         //list of layers that do not support yxfb or perform worse in yxfb than in bfyx
1529         if (prim.type() == cldnn::detection_output::type_id() || prim.type() == cldnn::proposal::type_id() ||
1530             prim.type() == cldnn::roi_pooling::type_id() || prim.type() == cldnn::deconvolution::type_id() ||
1531             prim.type() == cldnn::upsampling::type_id() || prim.type() == cldnn::reorg_yolo::type_id())
1532             lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::bfyx_only_layer, 1);
1533     }
1534
1535     const auto reorder_input = [this, &lo](typed_program_node<convolution>& conv_node)
1536     {
1537         auto conv_prim = conv_node.get_primitive();
1538         auto& input_node = conv_node.get_dependency(0);
1539         auto&& weights_layout = conv_node.weights(0).get_output_layout();
1540         auto&& input_layout = input_node.get_output_layout();
1541
1542         std::shared_ptr<reorder> new_input = nullptr;
1543
1544         if (input_node.type() == reorder::type_id()) //convolution's input is a reorder
1545         {
1546             auto reorder_prim = input_node.as<reorder>().typed_desc();
1547             auto& reorder_input = input_node.get_dependency(0);
1548             auto reorder_layout = input_node.get_output_layout();
1549             reorder_layout.data_type = reorder_prim->output_data_type;
1550             new_input = lo.get_reorder(
1551                 reorder_layout,
1552                 reorder_prim->id,
1553                 layout_optimizer::data_type::input,
1554                 conv_node,
1555                 weights_layout).first;
1556
1557             auto reorder_removed = false;
1558             if (new_input && new_input->output_format != format::winograd_2x3_s1_data && new_input->output_format != format::bf8_xy16 && new_input->output_format != format::byxf) //output format is not optimal
1559             {
1560                 auto reorder_input_layout = reorder_input.get_output_layout();
1561
1562                 auto opt_layout = layout(new_input->output_data_type, new_input->output_format, reorder_input_layout.size);
1563                 if (reorder_input_layout == opt_layout) //reorder 'breaks' optimal format
1564                 {
1565                     if (reorder_prim->subtract_per_feature.empty() &&
1566                         reorder_prim->mean.empty() &&
1567                         !reorder_prim->output_padding) //just plain reorder
1568                     {
1569                         conv_node.replace_dependency(0, reorder_input);
1570                         if (input_node.get_users().size() == 0 && !input_node.is_output())
1571                         {
1572                             reorder_removed = extract_and_remove(input_node);
1573                         }
1574                         new_input = nullptr;
1575                     }
1576                     else //change reorder's output layout
1577                     {
1578                         reorder_prim->output_format = opt_layout.format;
1579                         reorder_prim->output_data_type = opt_layout.data_type;
1580                         new_input = nullptr;
1581                     }
1582                 }
1583                 else //current reorder gives bad output, simply change it
1584                 {
1585                     reorder_prim->output_format = opt_layout.format;
1586                     reorder_prim->output_data_type = opt_layout.data_type;
1587                     new_input = nullptr;
1588                 }
1589             }
1590
1591             if (!reorder_removed)
1592                 input_node.recalc_output_layout();
1593             else
1594                 conv_node.recalc_output_layout();
1595         }
1596         else
1597         {
1598             new_input = lo.get_reorder(
1599                 input_node.get_output_layout(),
1600                 input_node.id(),
1601                 layout_optimizer::data_type::input,
1602                 conv_node,
1603                 weights_layout).first;
1604         }
1605
1606         if (new_input && new_input->output_format == format::winograd_2x3_s1_data)
1607         {
1608             auto lower_size = (conv_prim->input_offset.negate() + input_layout.size);
1609
1610             tensor upper_input_padding = tensor{ 0 };
1611             upper_input_padding.spatial[0] = (2 - (lower_size.spatial[0] % 2)) % 2;          //winograd conv requires input's x to be of form 4 + 2n with x >= 3; we can simplify this to x % 2 == 0
1612             upper_input_padding.spatial[1] = (8 - ((lower_size.spatial[1] - 2) % 8)) % 8;    //for y, (y - 2) % 8 == 0 must hold
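            //e.g. (illustration): for lower_size.spatial == {7, 13} this gives x padding (2 - 7 % 2) % 2 == 1
            //(padded x == 8) and y padding (8 - (13 - 2) % 8) % 8 == 5 (padded y == 18, and (18 - 2) % 8 == 0)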
1613
1614             apply_needed_padding(conv_node, input_node, padding{ conv_prim->input_offset.negate().sizes(), upper_input_padding.sizes() });
1615
1616             auto winograd_output = std::make_shared<reorder>("_winograd_" + conv_node.id(), conv_node.id(), input_layout.format,
1617                 input_layout.data_type, std::vector<float>{}, cldnn_reorder_mean_mode::mean_subtract, conv_node.output_layout.data_padding);
1618             conv_node.output_layout.data_padding = padding{};
1619             auto& back_node = get_or_create(winograd_output);
1620             back_node.processing_itr = processing_order.insert(std::next(conv_node.processing_itr), &back_node);
1621
1622             auto bias_term = conv_node.bias_term();
1623             //create additional eltwise node after reorder to compute bias
1624             if (bias_term)
1625             {
1626                 auto& bias_node = conv_node.get_dependency(2);
1627                 std::vector<primitive_id> inputs = { back_node.id(), bias_node.id() };
1628                 auto winograd_output_biases = std::make_shared<eltwise>(back_node.id() + "_bias", inputs,
1629                     cldnn::eltwise_mode::sum, conv_prim->with_activation, conv_prim->activation_negative_slope,
1630                     back_node.output_layout.data_padding);
1631                 back_node.output_layout.data_padding = padding{};
1632                 auto& back_bias_node = get_or_create(winograd_output_biases);
1633                 back_bias_node.processing_itr = processing_order.insert(std::next(back_node.processing_itr), &back_bias_node);
1634                 replace_all_usages(back_node, back_bias_node);
1635                 add_connection(back_node, back_bias_node);
1636                 add_connection(bias_node, back_bias_node);
1637                 conv_node.invalidate_users();
1638                 replace_all_usages(conv_node, back_bias_node);
1639             }
1640
1641             if (conv_prim->with_activation)
1642             {
1643                 conv_node.typed_desc()->with_activation = false;
1644                 if (!bias_term)
1645                     back_node.set_fused_activation(activation_relu_negative_slope, cldnn_activation_additional_params_t{ conv_prim->activation_negative_slope, 0.0f });
1646             }
1647
1648             if (!bias_term)
1649             {
1650                 conv_node.invalidate_users();
1651                 replace_all_usages(conv_node, back_node);
1652             }
1653             add_connection(conv_node, back_node);
1654
1655             auto& r_node = get_or_create(new_input);
1656             r_node.as<reorder>().set_input_offset(conv_prim->input_offset);
1657
1658             if (!bias_term)
1659             {
1660                 swap_names(conv_node, back_node);
1661                 if (conv_node.is_output())
1662                 {
1663                     conv_node.set_output(false);
1664                     back_node.set_output(true);
1665                     for (auto& output : outputs)
1666                     {
1667                         if (output == &conv_node)
1668                         {
1669                             output = &back_node;
1670                             break;
1671                         }
1672                     }
1673                 }
1674             }
1675             else
1676             {
1677                 conv_node.remove_dependency(2);
1678                 auto& back_bias_node = *nodes_map.find(back_node.id() + "_bias")->second;
1679                 swap_names(conv_node, back_bias_node);
1680                 if (conv_node.is_output())
1681                 {
1682                     conv_node.set_output(false);
1683                     back_bias_node.set_output(true);
1684                     for (auto& output : outputs)
1685                     {
1686                         if (output == &conv_node)
1687                         {
1688                             output = &back_bias_node;
1689                             break;
1690                         }
1691                     }
1692                 }
1693             }
1694         }
1695
1696         if (new_input && (new_input->output_format == format::bf8_xy16 || new_input->output_format == format::byxf))
1697         {
1698             auto conv1x1_output = std::make_shared<reorder>("_conv1x1_reorder_back_" + conv_node.id(), conv_node.id(), input_layout.format, input_layout.data_type);
1699             auto& back_node = get_or_create(conv1x1_output);
1700             back_node.processing_itr = processing_order.insert(std::next(conv_node.processing_itr), &back_node);
1701
1702             conv_node.invalidate_users();
1703             replace_all_usages(conv_node, back_node);
1704             add_connection(conv_node, back_node);
1705         }
1706
1707         if (new_input)
1708         {
1709             auto& r_node = get_or_create(new_input);
1710             add_intermediate(r_node, conv_node, 0, r_node.dependencies.empty());
1711             conv_node.recalc_output_layout();
1712         }
1713     };
1714
1715     const auto reorder_input_detection_output = [this, &lo](typed_program_node<detection_output>& detection_output_node)
1716     {
1717         auto detection_output_prim = detection_output_node.get_primitive();
1718
1719         for (size_t i = 0; i < detection_output_node.get_dependencies().size(); i++)
1720         {
1721             auto& input = detection_output_node.get_dependency(i);
1722             std::shared_ptr<reorder> new_input = lo.get_reorder(
1723                 input.get_output_layout(),
1724                 input.id(),
1725                 layout_optimizer::data_type::input,
1726                 detection_output_node,
1727                 layout{ data_types::f32, format::bfyx, tensor{} }).first;
1728
1729             if (new_input)
1730             {
1731                 add_intermediate(new_input, detection_output_node, i);
1732             }
1733         }
1734     };
1735
1736     for (auto& prim : processing_order)
1737     {
1738         //there's an assumption that only convolution will take data/input_layout as input
1739         //an exception to that rule would be a convolution which takes a reorder as input - see reorder_input above
1740         do_for_types<convolution, detection_output>(*prim,
1741             reorder_input,                  //case for convolution
1742             reorder_input_detection_output  //case for detection-output
1743             );
1744     }
1745 }
1746
1747 //function which prepares the given primitive for bias optimization
1748 template <typename T>
1749 void program_impl::optimize_bias(T& node, layout_optimizer& lo)
1750 {
1751     layout output_layout = node.get_output_layout();
1752
1753     size_t weights_offset = node.get_primitive()->input.size();
1754     size_t bias_offset = weights_offset + wrap_if_single(node.get_primitive()->weights).size();
1755     for (size_t i = bias_offset; i < node.get_dependencies().size(); ++i)
1756     {
1757         //find the bias primitive with the given primitive_id and insert a reorder for it if the layout optimizer requests one
1758         const program_node& bias = node.get_dependency(i);
1759         const auto bias_type = layout_optimizer::data_type::bias;
1760         auto reorder = lo.get_reorder(
1761                 bias.get_output_layout(),
1762                 bias.id(),
1763                 bias_type,
1764                 node,
1765                 output_layout);
1766
1767         if (reorder.first)
1768             this->add_intermediate(reorder.first, node, i, !reorder.second);
1769     }
1770 }
1771 template void program_impl::optimize_bias<convolution_node>(convolution_node& node, layout_optimizer& lo);
1772 template void program_impl::optimize_bias<deconvolution_node>(deconvolution_node& node, layout_optimizer& lo);
1773 template void program_impl::optimize_bias<fully_connected_node>(fully_connected_node& node, layout_optimizer& lo);
1774 template void program_impl::optimize_bias<embed_node>(embed_node& node, layout_optimizer& lo);
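// Dependency layout assumed by optimize_bias (illustration): for a convolution with a single
// input and split() == 2 the dependencies are [input, w0, w1, b0, b1], so weights_offset == 1,
// bias_offset == 3 and the loop visits exactly b0 and b1.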
1775
1776 void program_impl::pre_optimize_bias(layout_optimizer& lo)
1777 {
1778     for (auto& p : nodes_map)
1779     {
1780         auto& prim = *p.second;
1781         if (prim.type() == convolution::type_id())
1782         {
1783             if (!prim.as<convolution>().weights_quantization_term())
1784                 optimize_bias(prim.as<convolution>(), lo);
1785         }
1786         else if (prim.type() == deconvolution::type_id())
1787         {
1788             optimize_bias(prim.as<deconvolution>(), lo);
1789         }
1790         else if (prim.type() == fully_connected::type_id())
1791         {
1792             if (!prim.as<fully_connected>().weights_quantization_term())
1793                 optimize_bias(prim.as<fully_connected>(), lo);
1794         }
1795         else if (prim.type() == embed::type_id())
1796         {
1797             optimize_bias(prim.as<embed>(), lo);
1798         }
1799     }
1800 }
1801
1802 template <typename T>
1803 void program_impl::optimize_depthwise_sep_pre(T& node)
1804 {
1805     //enable optimization only when IFM / split <= 8 (otherwise scheduling multiple opt kernels is better) and split >= 16
1806     if (!(node.get_dependency(0).get_output_layout().size.feature[0] / node.get_primitive()->split() <= 8) ||
1807         !(node.get_primitive()->split() >= 16))
1808         return;
1809
1810     //make sure the weights and biases are data primitives and
1811     //are not reused by other primitives, as they will be overridden with concatenated ones
1812     for (size_t i = 1; i < node.get_dependencies().size(); i++)
1813     {
1814         auto& weights_or_biases = node.get_dependency(i);
1815         if (weights_or_biases.get_users().size() > 1 || weights_or_biases.type() != data::type_id())
1816             return;
1817     }
1818
1819     node.set_depthwise_sep_opt(true);
1820 }
1821 template void program_impl::optimize_depthwise_sep_pre<convolution_node>(convolution_node& node);
1822 template void program_impl::optimize_depthwise_sep_pre<deconvolution_node>(deconvolution_node& node);
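// Illustration: a depthwise convolution with IFM == 32 and split() == 32 gives
// 32 / 32 == 1 <= 8 and split() >= 16, so the optimization is enabled; with split() == 8
// it is skipped, since scheduling several optimized kernels is faster in that case.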
1823
1824 void program_impl::prepare_depthwise_sep_opt()
1825 {
1826     //depthwise separated convolution/deconvolution optimization
1827     for (auto& p : nodes_map)
1828     {
1829         auto& prim = *p.second;
1830         if (prim.type() == convolution::type_id())
1831         {
1832             optimize_depthwise_sep_pre(prim.as<convolution>());
1833         }
1834         else if (prim.type() == deconvolution::type_id())
1835         {
1836             optimize_depthwise_sep_pre(prim.as<deconvolution>());
1837         }
1838     }
1839 }
1840
1841 void program_impl::handle_reshape()
1842 {
1843     //reshape primitive by definition does not change underlying data, only shape description
1844     //however during graph initialization and data optimization the layouts can be changed without user's knowledge,
1845     //when reshape is followed by reorder, it is likely that reorder's output will not be as expected (for example reshape with flattened shape)
1846     //this pass resolves the issue by changing the graph in the following way:
1847     //- in case reshape has multiple users within a reshape->reorder sequence, it will be split into multiple reshape primitives, each with a single user
1848     //- in case of reshape->reorder sequence, the additional reorder before reshape will be added,
1849     //  if last reorder does not contain padding or mean subtract, it will be removed later in the graph
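    //sketch of the transformation (hypothetical ids):
    //  before: input -> reshape -> {reorder1, reorder2, other_user}
    //  after:  input -> reshape                 -> {reorder1, other_user}
    //          input -> _reshape_split_reorder2 -> reorder2
    //so each reshape that feeds a reorder can receive its own "_reshape_input_" reorder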
1850
1851     for (const auto& node : processing_order)
1852     {
1853         if (node->is_type<reshape>())
1854         {
1855             auto& input_node = node->get_dependency(0);
1856
1857             if (input_node.is_type<reorder>())
1858                 continue;
1859
1860             //vector for storing reorder-type users for which split reshape primitives are needed (except for the first one, where the original reshape will be used)
1861             std::vector<program_node*> reorder_node_to_split;
1862
1863             //find the users of reshape that are reorder type, if none present then skip the current node
1864             for (const auto& user : node->get_users())
1865             {
1866                 if (user->is_type<reorder>())
1867                     reorder_node_to_split.push_back(user);
1868             }
1869
1870             if (!reorder_node_to_split.empty())
1871             {
1872                 auto& prim_node = node->as<reshape>();
1873                 const auto& prim = prim_node.get_primitive();
1874                 auto output_shape = prim->output_shape;
1875
1876                 //vector for storing reshape nodes to connect to new reorder nodes (if needed)
1877                 std::vector<program_node*> reorder_reshape_nodes;
1878
1879                 bool skip_first_user = false;
1880                 auto reshape_users = node->get_users();
1881                 for (const auto& user : reshape_users)
1882                 {
1883                     //reshape node for the first user will be the original reshape from the graph
1884                     if (!skip_first_user)
1885                     {
1886                         if (std::find(reorder_node_to_split.begin(), reorder_node_to_split.end(), user) != reorder_node_to_split.end())
1887                             reorder_reshape_nodes.push_back(node);
1888                         skip_first_user = true;
1889                         continue;
1890                     }
1891
1892                     //other reshapes will be clones of the original one connected to reshape->reorder sequences
1893                     if (std::find(reorder_node_to_split.begin(), reorder_node_to_split.end(), user) != reorder_node_to_split.end())
1894                     {
1895                         auto new_reshape = std::make_shared<reshape>("_reshape_split_" + user->id() + "_" + node->id(), input_node.id(), output_shape);
1896                         auto& new_reshape_node = get_or_create(new_reshape);
1897                         add_connection(input_node, new_reshape_node);
1898                         user->replace_dependency(0, new_reshape_node);
1899                         new_reshape_node.processing_itr = processing_order.insert(std::next(input_node.processing_itr), &new_reshape_node);
1900                         reorder_reshape_nodes.push_back(&new_reshape_node);
1901                     }
1902                 }
1903
1904                 //add new reorder nodes to proper reshape node
1905                 auto reshape_reorder_id = 0;
1906                 for (const auto& reorder_node : reorder_node_to_split)
1907                 {
1908                     auto& reorder_reshape_node = reorder_reshape_nodes[reshape_reorder_id];
1909                     auto reshape_in_layout = reorder_node->get_output_layout();
1910                     auto reshape_input = std::make_shared<reorder>("_reshape_input_" + reorder_node->id() + "_" + reorder_reshape_node->id(), input_node.id(), reshape_in_layout.format, reshape_in_layout.data_type);
1911                     auto& reshape_input_node = get_or_create(reshape_input);
1912                     add_intermediate(reshape_input_node, *reorder_reshape_node, 0, reshape_input_node.dependencies.empty());
1913                     reshape_reorder_id++;
1914                 }
1915             }
1916
1917             auto reshape_layout = node->get_output_layout();
1918             if (!(node->is_output()) && (reshape_layout.format != cldnn::format::bfyx))
1919             {
1920                 auto bfyx_layout = layout({ reshape_layout.data_type, cldnn::format::bfyx, reshape_layout.size });
1921                 //when some primitive does an implicit reorder to some other format, we lose the info about pitches in the reshape stage
1922                 //we assume user provides the input vector in bfyx
1923                 if (!are_layouts_identical(reshape_layout, bfyx_layout).second)
1924                 {
1925                     auto reshape_input = std::make_shared<reorder>("_reshape_input_" + node->id(), input_node.id(), cldnn::format::bfyx, reshape_layout.data_type);
1926                     auto& reshape_input_node = get_or_create(reshape_input);
1927                     add_intermediate(reshape_input_node, *node, 0, reshape_input_node.dependencies.empty());
1928
1929                     auto reshape_users = node->get_users();
1930                     for (const auto& user : reshape_users)
1931                     {
1932                         size_t idx = 0;
1933                         for (size_t i = 0; i < user->get_dependencies().size(); i++)
1934                         {
1935                             auto& input = user->get_dependency(i);
1936                             if (input.id() == node->id()) {
1937                                 idx = i;
1938                                 break;
1939                             }
1940                         }
1941                         auto reshape_output = std::make_shared<reorder>("_reshape_output_" + node->id(), user->id(), reshape_layout.format, reshape_layout.data_type);
1942                         auto& reshape_output_node = get_or_create(reshape_output);
1943                         add_intermediate(reshape_output_node, *user, idx, reshape_output_node.dependencies.empty());
1944                     }
1945                 }
1946             }
1947         }
1948     }
1949 }
1950
1951 //function which prepares given primitive for weights optimization
1952 template <typename T>
1953 void program_impl::optimize_weights(T& node, layout_optimizer& lo)
1954 {
1955     auto weights_offset = node.get_primitive()->input.size();
1956     auto bias_offset = weights_offset + wrap_if_single(node.get_primitive()->weights).size();
1957     for (auto i = weights_offset; i < bias_offset; i++)
1958     {
1959         auto& weights = node.get_dependency(i);
1960         auto* impl = node.get_selected_impl().get();
1961         auto output_layout = node.get_output_layout();
1962         //use the layout of the weights dependency currently being processed
1963         auto weights_layout = weights.get_output_layout();
1964         const auto weights_type = layout_optimizer::data_type::weights;
1965
1966         auto reorders = lo.get_generic_layer(
1967                 impl->_weights_reorder_params,
1968                 weights.id(),
1969                 weights_layout,
1970                 weights_type);
1971
1972         for (auto& reorder : reorders)
1973         {
1974             //insert new generic_layer node to topology
1975             this->add_intermediate(reorder.first, node, i, !reorder.second);
1976             //set generic_layer's node output layout and implementation
1977             auto& g_node = node.get_dependency(i);
1978             g_node.get_output_layout(false);
1979             g_node.selected_impl = g_node.type()->choose_impl(*engine, g_node);
1980         }
1981         //set the old output layout and do not invalidate users as change of weights will not affect output layout
1982         node.set_output_layout(output_layout, false);
1983     }
1984 }
1985 template void program_impl::optimize_weights<convolution_node>(convolution_node& node, layout_optimizer& lo);
1986 template void program_impl::optimize_weights<deconvolution_node>(deconvolution_node& node, layout_optimizer& lo);
1987 template void program_impl::optimize_weights<fully_connected_node>(fully_connected_node& node, layout_optimizer& lo);
1988
1989 void program_impl::post_optimize_weights(layout_optimizer& lo)
1990 {
1991     for (auto& p : nodes_map)
1992     {
1993         auto& prim = *p.second;
1994         if (prim.type() == convolution::type_id())
1995         {
1996             optimize_weights(prim.as<convolution>(), lo);
1997         }
1998         else if (prim.type() == deconvolution::type_id())
1999         {
2000             optimize_weights(prim.as<deconvolution>(), lo);
2001         }
2002         else if (prim.type() == fully_connected::type_id())
2003         {
2004             optimize_weights(prim.as<fully_connected>(), lo);
2005         }
2006         //else if (prim.type() == lstm_gemm::type_id())//TODO: Enable postoptimize weights for lstm
2007         //{
2008         //    prep_opt(prim.as<lstm_gemm>()); //we should take care of weights and reccurent
2009         //}
2010     }
2011 }
2012
2013 template <typename T>
2014 void program_impl::optimize_depthwise_sep_post(T& node)
2015 {
2016     if (!node.get_depthwise_sep_opt())
2017         return;
2018
2019     const auto& split = node.get_primitive()->split();
2020
2021     auto dependency_offset = node.get_primitive()->input.size();
2022     //concatenate weights
2023     {
2024         //if weights were optimized, we need to use the sizes after optimization
2025         auto target_layout = get_weights_layout(node.get_dependency(dependency_offset), split);
2026         merge_buffers(engine, node, target_layout, dependency_offset, dependency_offset + split);
2027         dependency_offset++;
2028     }
2029
2030     //concatenate biases
2031     if (node.get_primitive()->bias.size() != 0)
2032     {
2033         const auto& bias_layout = node.get_dependency(dependency_offset).get_output_layout();
2034         auto target_layout = layout(bias_layout.data_type, cldnn::format::bfyx, { 1, 1, bias_layout.size.spatial[0] * split, 1 });
2035         merge_buffers(engine, node, target_layout, dependency_offset, dependency_offset + split);
2036         dependency_offset++;
2037     }
2038
2039     if (node.template is_type<convolution>())
2040     {
2041         auto& prim_node = node.template as<convolution>();
2042         const auto& prim = prim_node.get_primitive();
2043
2044         // concatenate weights quantization factors
2045         if (prim->weights_quantization_factors.size() != 0)
2046         {
2047             const auto& weights_quantization_layout = node.get_dependency(dependency_offset).get_output_layout();
2048             auto target_layout = layout(weights_quantization_layout.data_type, cldnn::format::bfyx, { 1, 1, weights_quantization_layout.size.batch[0] * split, 1 });
2049             merge_buffers(engine, node, target_layout, dependency_offset, dependency_offset + split);
2050             dependency_offset++;
2051         }
2052         // concatenate output calibration factors
2053         if (prim->output_calibration_factors.size() != 0)
2054         {
2055             const auto& output_calibration_layout = node.get_dependency(dependency_offset).get_output_layout();
2056             auto target_layout = layout(output_calibration_layout.data_type, cldnn::format::bfyx, { 1, 1, output_calibration_layout.size.batch[0] * split, 1 });
2057             merge_buffers(engine, node, target_layout, dependency_offset, dependency_offset + split);
2058             dependency_offset++;
2059         }
2060     }
2061
2062     if (node.get_primitive())
2063         //override node split, as only one kernel will be executed
2064         node.set_split(1);
2065 }
2066 template void program_impl::optimize_depthwise_sep_post<convolution_node>(convolution_node& node);
2067 template void program_impl::optimize_depthwise_sep_post<deconvolution_node>(deconvolution_node& node);
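// Illustration: for a split convolution with dependencies [input, w0..w(s-1), b0..b(s-1)],
// the first merge_buffers call above concatenates w0..w(s-1) into a single weights buffer at
// dependency_offset == 1, the next one merges the biases, and set_split(1) then makes the
// node execute as one kernel over the merged buffers.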
2068
2069 void program_impl::prep_opt_depthwise_sep_post()
2070 {
2071     //depthwise separated convolution/deconvolution optimization
2072     for (auto& p : nodes_map)
2073     {
2074         auto& prim = *p.second;
2075         if (prim.type() == convolution::type_id())
2076         {
2077             optimize_depthwise_sep_post(prim.as<convolution>());
2078         }
2079         else if (prim.type() == deconvolution::type_id())
2080         {
2081             optimize_depthwise_sep_post(prim.as<deconvolution>());
2082         }
2083     }
2084 }
2085
2086 void program_impl::apply_needed_padding(program_node& node, program_node& prev_node,
2087     const padding& needed_padding)
2088 {
2089     auto target_layout = prev_node.get_output_layout();
2090
2091     // Short circuit if padding did not change.
2092     if (target_layout.data_padding == needed_padding)
2093         return;
2094
2095     // Special handling for input nodes.
2096     if (prev_node.is_type<input_layout>() || prev_node.is_type<mutable_data>())
2097     {
2098         target_layout.data_padding = needed_padding;
2099
2100         auto r_prim = std::make_shared<reorder>("reorder_input_" + node.id(), prev_node.id(), target_layout);
2101         add_intermediate(r_prim, node, 0);
2102         return;
2103     }
2104
2105     prev_node.merge_output_padding(needed_padding);
2106 }
2107
2108 void program_impl::prepare_padding()
2109 {
2110     if (output_size_handling_enabled)
2111     {
2112         // Prepare upper padding for primitives that support output_size parameter.
2113         for (const auto& node : processing_order)
2114         {
2115             if (node->is_type<convolution>())
2116             {
2117                 auto& prim_node = node->as<convolution>();
2118                 const auto& prim = prim_node.get_primitive();
2119
2120                 if (!prim->with_output_size)
2121                     continue;
2122
2123                 auto filter_size = prim_node.weights(0).get_output_layout().size;
2124
2125                 auto needed_padding = calc_sliding_window_needed_input_padding(
2126                     prim_node.input().get_output_layout(),
2127                     prim->output_size, filter_size, prim->input_offset, prim->stride, prim->dilation, false, 1);
2128                 apply_needed_padding(prim_node, prim_node.input(), needed_padding);
2129             }
2130             else if (node->is_type<deconvolution>())
2131             {
2132                 auto& prim_node = node->as<deconvolution>();
2133                 const auto& prim = prim_node.get_primitive();
2134
2135                 if (!prim->with_output_size)
2136                     continue;
2137
2138                 auto filter_size = prim_node.weights(0).get_output_layout().size;
2139
2140                 auto needed_padding = calc_sliding_window_needed_input_padding(
2141                     prim_node.input().get_output_layout(),
2142                     prim->output_size, filter_size, prim->input_offset, prim->stride, { 1, 1, 1, 1 }, true, 1);
2143
2144                 apply_needed_padding(prim_node, prim_node.input(), needed_padding);
2145             }
2146             else if (node->is_type<pooling>())
2147             {
2148                 auto& prim_node = node->as<pooling>();
2149                 const auto& prim = prim_node.get_primitive();
2150
2151                 if (!prim->with_output_size)
2152                     continue;
2153
2154                 // NOTE: Currently there is no pooling implementation/pooling mode which does not check input data range.
2155                 // There is no need to add padding requirements on pooling inputs.
2156                 //auto needed_padding = calc_sliding_window_needed_input_padding(
2157                 //    prim_node.input().get_output_layout(),
2158                 //    prim->output_size, prim->size, prim->input_offset, prim->stride, {1, 1, 1, 1}, false, 1);
2159                 auto needed_padding = prim_node.input().get_output_layout().data_padding;
2160
2161                 apply_needed_padding(prim_node, prim_node.input(), needed_padding);
2162             }
2163         }
2164     }
2165
2166     // Prepare optimized padding for bfyx convolution.
2167     for (auto& pair : nodes_map)
2168     {
2169         if (pair.second->type() != convolution::type_id())
2170             continue;
2171
2172         auto& node = pair.second->as<convolution>();
2173         if (node.get_dependencies().empty())
2174             continue;
2175
2176         auto conv = node.get_primitive();
2177         auto& conv_input_node = node.get_dependency(0);
2178         auto conv_layout = node.get_output_layout();
2179
2180         // right now output padding optimization is only available for the formats listed in the check below
2181         if (conv_layout.format != cldnn::format::bfyx
2182             && conv_layout.format != cldnn::format::bf8_xy16
2183             && conv_layout.format != cldnn::format::byxf_af32
2184             && conv_layout.format != cldnn::format::fs_bs_yx_bsv4_fsv32)
2185         {
2186             continue;
2187         }
2188
2189         // Calculating input padding needed for convolution
2190         auto& filter_node = node.as<convolution>().weights(0);
2191         auto filter_prim = filter_node.get_primitive();
2192
2193         layout filter_layout = filter_node.get_output_layout();
2194
2195         // convolution has only one input primitive
2196         auto prev_prim_output_layout = conv_input_node.get_output_layout();
2197
2198         // Compute initial required paddings for primitive used as input for convolution.
2199         auto input_offset = conv->input_offset;
2200         auto stride = conv->stride;
2201         auto dilation = conv->dilation;
2202
2203         auto input_limit_x = input_offset.spatial[0] + (conv_layout.size.spatial[0] - 1) * stride.spatial[0] + (filter_layout.size.spatial[0] - 1) * dilation.spatial[0] + 1;
2204         auto input_limit_y = input_offset.spatial[1] + (conv_layout.size.spatial[1] - 1) * stride.spatial[1] + (filter_layout.size.spatial[1] - 1) * dilation.spatial[1] + 1;
2205
2206         auto left_padding = std::max(-input_offset.spatial[0], 0);
2207         auto top_padding = std::max(-input_offset.spatial[1], 0);
2208         auto right_padding = std::max(input_limit_x - prev_prim_output_layout.size.spatial[0], 0);
2209         auto bottom_padding = std::max(input_limit_y - prev_prim_output_layout.size.spatial[1], 0);
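        // e.g. (illustration): a 3x3 filter with stride 1, dilation 1 and input_offset -1 over a
        // 56x56 input producing 56x56 output gives input_limit_x = -1 + 55 + 2 + 1 = 57, hence
        // 1 pixel of left/right (and, symmetrically, top/bottom) padding.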
2210
2211         // Adjust right padding, so entire buffer size in X dimension is properly aligned.
2212         // TODO: NOTE: Will be re-enabled in a future check-in once a heuristic for the line-aligned algorithm is added.
2213         //auto needed_buffer_size_x = static_cast<cldnn::tensor::value_type>(
2214         //    round_up_to(left_padding + prev_prim_output_layout.size.spatial[0] + right_padding, 16));
2215         //right_padding = needed_buffer_size_x - left_padding - prev_prim_output_layout.size.spatial[0];
2216
2217         cldnn::padding needed_padding({ 0, 0, left_padding, top_padding }, { 0, 0, right_padding, bottom_padding }, 0);
2218         needed_padding = padding::max(prev_prim_output_layout.data_padding, needed_padding);
2219
2220         apply_needed_padding(node, conv_input_node, needed_padding);
2221     }
2222 }
2223
2224 void program_impl::propagate_constants()
2225 {
2226     constants_propagator prop(this);
2227
2228     for (auto& node : processing_order)
2229         prop.visit_node(*node);
2230
2231     auto&& to_replace = prop.calculate();
2232
2233     //remove all nodes which are no longer relevant, i.e. nodes which:
2234     // 1. are constants, and
2235     // 2. do not have a non-const user (so their data is not used during inference), and
2236     // 3. are not marked as outputs.
2237     // if a node has either a non-const user or is marked as output, it should be replaced with cldnn::data rather than removed (see next loop)
2238     auto proc_itr = processing_order.begin();
2239     while (proc_itr != processing_order.end())
2240     {
2241         auto& node = (*proc_itr++);
2242         if (!node->is_constant())
2243             continue;
2244         if (node->has_non_const_user() || (node->is_output() && !node->is_type<data>()))
2245             continue;
2246
2247         auto& users = node->users;
2248         auto& deps = node->dependencies;
2249
2250         for (size_t idx = 0; idx < deps.size(); idx++)
2251         {
2252             deps.at(idx)->users.remove(node);
2253         }
2254         deps.clear();
2255
2256         for (auto& usr : users) {
2257             auto& usr_deps = usr->dependencies;
2258             usr_deps.erase(std::remove(usr_deps.begin(), usr_deps.end(), node), usr_deps.end());
2259         }
2260         users.clear();
2261
2262         if (!node->is_output())
2263         {
2264             auto rem = remove_if_dangling(*node);
2265             assert(rem && "Non-output constant node which has only constant users should have been removed during constants propagation pass");
2266             (void)rem;
2267         }
2268     }
2269
2270     //replace all constant nodes which are relevant for inference (either used by non-const user or marked as output) with recomputed cldnn::data
2271     for (auto& cout : to_replace)
2272     {
2273         auto& id_to_replace = cout.first;
2274
2275         //TODO: do not use API primitives internally and get rid of this last 'cldnn::memory' internal usage
2276         memory api_memory = details::memory_c_to_cpp_converter::convert(api_cast(cout.second.get()));
2277         //the c-to-cpp converter does not take a reference, since normally that is done inside the API-impl layer (cldnn.cpp), so we need to do it manually
2278         cout.second->add_ref();
2279
2280         auto const_data = std::make_shared<data>("_cldnn_const_prop_" + id_to_replace, api_memory /* <<< REMOVE ME WHEN POSSIBLE */);
2281         auto& new_node = get_or_create(const_data);
2282         auto& curr_node = *nodes_map.at(id_to_replace);
2283
2284         if (!curr_node.is_type<generic_layer>())
2285         {
2286             auto curr_node_deps = curr_node.get_dependencies();
2287             for (auto& dep : curr_node_deps)
2288             {
2289                 auto dep_users = dep->get_users();
2290                 for (auto& dep_user : dep_users)
2291                 {
2292                     if (dep_user == &curr_node)
2293                         remove_connection(*dep, curr_node);
2294                 }
2295             }
2296         }
2297
2298         curr_node.dependencies.clear();
2299         //remove all constant users (as they will be either removed or replaced by cldnn::data which does not have any dependencies)
2300         curr_node.users.erase(
2301             std::remove_if(curr_node.users.begin(), curr_node.users.end(), [](program_node* node) { return node->is_constant(); }),
2302             curr_node.users.end()
2303         );
2304         replace(curr_node, new_node, false, false);
2305     }
2306 }
2307
2308 void program_impl::prepare_buffer_fusing()
2309 {
2310     bool is_debug = options.get<build_option_type::debug>()->enabled();
2311     auto itr = processing_order.begin();
2312     while (itr != processing_order.end())
2313     {
2314         auto& node = (*itr++);
2315
2316         // TODO: Move fused activation to previous layer when possible
2317         if (node->fused_activation.activation_func != cldnn_activation_func_t::activation_none)
2318             continue;
2319
2320         do_for_types<concatenation>(*node, [this, is_debug](concatenation_node& node)
2321         {
2322             // buffer fusing should not be performed if one of inputs produces padded output since
2323             // it could break desired memory alignment. On the other hand, if this node uses all inputs
2324             // exclusively (see check above) they should not have output padding set since concatenation
2325             // does not ask for any.
2326             if (node.has_padded_dependency())
2327                 return;
2328
2329             auto concat_axis = node.get_primitive()->axis;
2330             auto padd = node.get_output_layout().data_padding;
2331
2332             tensor lower_padd = padd.lower_size();
2333             tensor upper_padd = padd.upper_size();
2334
2335             auto upper_padd_val = node.get_output_layout().get_buffer_size().raw[concat_axis] - lower_padd.raw[concat_axis];
2336             tensor lower_padd_offset = lower_padd;
2337
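            // Illustration: concatenating two bfyx inputs with feature sizes 3 and 5 along f
            // (output f-size 8) yields input1 padding {lower f: 0, upper f: 5} and input2
            // padding {lower f: 3, upper f: 0}, so both write directly into the shared buffer.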
2338             std::list<std::pair<const std::vector<program_node*>, tensor>> stack = { std::make_pair(node.get_dependencies(), tensor{ 0, 0, 0, 0 }) };
2339             while (!stack.empty())
2340             {
2341                 auto nodes_list = stack.front();
2342                 stack.pop_front();
2343
2344                 auto cascade_adjustment = nodes_list.second;
2345                 upper_padd.raw[concat_axis] = upper_padd_val;
2346                 lower_padd = lower_padd_offset;
2347
2348                 //check if in-place concatenation can be applied for this set of inputs
2349                 for (auto input : nodes_list.first)
2350                 {
2351                     //if any of this node's inputs is used by more than one primitive and is not an optimized concatenation, do not fuse buffers;
2352                     //also, if an input is marked as a network output, prevent optimizations which would affect the form of its output (unless the debug flag is set)
2353                     // todo: in the future, if this case is a problem, it can be optimized further to enable buffer fusing
2354                     //       per single input rather than all/none
2355                     // + restrict input types to the primitives checked below due to problems with output padding on b and f
2356                     if ((!input->is_type<pooling>() && !input->is_type<convolution>() && !input->is_type<activation>() && !input->is_type<concatenation>() && !input->is_type<crop>() && !input->is_type<scale>()) ||
2357                         (input->is_output() && !is_debug) ||
2358                         input->get_users().size() > 2)
2359                         return;
2360
2361                     if (input->get_users().size() > 1)
2362                     {
2363                         auto user_count = input->get_users().size();
2364                         for (auto& user : input->get_users())
2365                             if (user->is_type<concatenation>())
2366                                 user_count--;
2367                         if (user_count > 1)
2368                             return;
2369                     }
2370
2371                     //check only for spatial paddings. Accept feature and batch
2372                     if (input->get_output_layout().data_padding.lower_size().spatial[0] != 0 ||
2373                         input->get_output_layout().data_padding.upper_size().spatial[0] != 0 ||
2374                         input->get_output_layout().data_padding.lower_size().spatial[1] != 0 ||
2375                         input->get_output_layout().data_padding.upper_size().spatial[1] != 0)
2376                         return;
2377                 }
2378
2379                 //apply the in-place concatenation optimization
2380                 for (auto input : nodes_list.first)
2381                 {
2382                     auto input_length = input->get_output_layout().size.raw[concat_axis];
2383
2384                     // shrink upper pad so it points at the end of the input's buffer
2385                     //
2386                     //   |--- lower padd ---|                    |---------- upper padd -----------|
2387                     //   |-- output padd ---| ----- input1 ------|----- input2 -----|-- out padd --|
2388                     upper_padd.raw[concat_axis] -= input_length;
2389
2390                     //adjust padding sizes for cascade concatenations
2391                     auto lower_padd_tmp = lower_padd;
2392                     lower_padd_tmp.raw[concat_axis] += cascade_adjustment.raw[concat_axis];
2393                     auto upper_padd_tmp = upper_padd;
2394                     upper_padd_tmp.raw[concat_axis] -= cascade_adjustment.raw[concat_axis];
2395
2396                     // set new padding for input
2397                     input->set_output_padding(padding(lower_padd_tmp.sizes(), upper_padd_tmp.sizes()));
2398
2399                     // move lower padd further
2400                     //
2401                     //   |-------------- lower padd -------------|---------- upper padd -----------|
2402                     //   |-- output padd ---| ----- input1 ------|----- input2 -----|-- out padd --|
2403
2404                     lower_padd.raw[concat_axis] += input_length;
2405
2406                     if (input->type() == concatenation::type_id() && input->can_be_optimized())
2407                     {
2408                         if (input->as<concatenation>().get_primitive()->axis != node.get_primitive()->axis)
2409                             return;
2410
2411                         if (!input->get_dependencies().empty())
2412                             stack.push_back(std::make_pair(input->get_dependencies(), input->get_output_layout().data_padding.lower_size()));
2413                     }
2414                 }
2415             }
2416
2417             node.can_be_optimized(true);
2418         });
2419
2420         // zero copy
2421         do_for_types<crop>(*node, [this, is_debug](crop_node& node)
2422         {
2423             //if the node is marked as a network output, prevent optimizations which would affect the form of its output, unless the debug flag is set
2424             if (node.is_output() && !is_debug)
2425                 return;
2426
2427             //do not optimize when next node is concatenation which is not output
2428             if (node.get_users().size() == 1 && node.get_users().front()->is_type<concatenation>() && !node.get_users().front()->is_output())
2429                 return;
2430
2431             if (node.get_dependencies().size() == 1 &&
2432                 node.get_users().size() > 0)
2433             {
2434                 // optimization is available for cropping across depth (features) only;
2435                 // if output padding already defines padding across features it wouldn't
2436                 // work, because it expects zeros in the padded area.
2437                 auto format = node.get_output_layout().format;
2438                 auto crop_prim = node.get_primitive();
2439                 auto input_layout = node.get_dependency(0).get_output_layout();
2440                 auto out_padd = node.get_output_layout().data_padding;
2441                 if (format == format::bfyx &&
2442                     crop_prim->reference_input.batch[0] == input_layout.size.batch[0] &&
2443                     crop_prim->reference_input.spatial[0] == input_layout.size.spatial[0] &&
2444                     crop_prim->reference_input.spatial[1] == input_layout.size.spatial[1] &&
2445                     out_padd.lower_size().feature[0] == 0 &&
2446                     out_padd.upper_size().feature[0] == 0 &&
2447                     out_padd.lower_size().batch[0] == 0 &&
2448                     out_padd.upper_size().batch[0] == 0 &&
2449                     out_padd.lower_size().spatial[0] == 0 &&
2450                     out_padd.lower_size().spatial[1] == 0 &&
2451                     out_padd.upper_size().spatial[0] == 0 &&
2452                     out_padd.upper_size().spatial[1] == 0)
2453                 {
2454                     //  Regular crop
2455                     //  crop input buffer
2456                     //  |___________data____________|
2457                     //
2458                     //  crop output buffer
2459                     //  |-------->| offsets[f]  |<--|
2460                     //            |_____data____|
2461                     //             <------------>
2462                     //           reference size
2463                     //
2464                     //  Inplace crop
2465                     //  crop output buffer
2466                     //  |_low_pad_|__data_size__|___|<-upper pad
2467
2468                     node.set_output_padding(padding(
2469                     { out_padd.lower_size().batch[0], crop_prim->offsets.feature[0], out_padd.lower_size().spatial[0], out_padd.lower_size().spatial[1] },
2470                     { out_padd.upper_size().batch[0], input_layout.size.feature[0] - crop_prim->offsets.feature[0] - crop_prim->reference_input.feature[0],
2471                         out_padd.upper_size().spatial[0], out_padd.upper_size().spatial[1] }));
2472                     node.can_be_optimized(true);
2473                 }
2474             }
2475         });
2476
2477         do_for_types<reshape>(*node, [this](reshape_node& node)
2478         {
2479             node.get_output_layout();
2480             if (node.is_in_place() && node.get_fused_activation_func() == activation_none)
2481                 node.can_be_optimized(true);
2482         });
2483         do_for_types<reorder>(*node, [this](reorder_node& node)
2484         {
2485             auto& input = node.input();
2486             auto output_layout = node.get_output_layout();
2487             //This is a WA for topologies that perform worse with the conv1x1 optimization because of the additional reorders it introduces
2488             auto remove_bf8_xy_opt = ((input.is_type<pooling>() || input.is_type<concatenation>()) &&
2489                 output_layout.format == format::bf8_xy16 && input.get_users().size() == 1);
2490             //Remove the reorder from convolution 1x1 to bfyx under some conditions
2491             auto remove_byxf_opt = (input.is_type<convolution>() &&
2492                 input.get_users().size() == 1 &&
2493                 input.get_output_layout().format == format::byxf);
2494             //check if all of the input's users have the same format
2495             auto all_users_same_format = true;
2496             auto input_user_layout_format = input.get_users().front()->get_output_layout().format;
2497             for (auto const& user : input.get_users())
2498             {
2499                 if (user->get_output_layout().format != input_user_layout_format)
2500                 {
2501                     all_users_same_format = false;
2502                     break;
2503                 }
2504             }
2505             auto same_data_type = input.get_output_layout().data_type == output_layout.data_type;
2506             //The optimization is only available for layers that support different input and output formats.
2507             //TODO: a new API needs to be created to read such caps
2508             if (!(input.is_type<pooling>() && (output_layout.format == format::bfyx || output_layout.format == format::yxfb || output_layout.format == format::byxf) && all_users_same_format && same_data_type) &&
2509                 !remove_bf8_xy_opt &&
2510                 !(input.is_type<convolution>() && input.get_output_layout().format == format::bf8_xy16) &&
2511                 !(input.is_type<eltwise>() && (output_layout.format == format::bfyx || output_layout.format == format::yxfb || output_layout.format == format::byxf) && all_users_same_format && same_data_type) &&
2512                 !(remove_byxf_opt && (node.get_users().front()->is_type<eltwise>() || node.get_users().front()->is_type<pooling>())))
2513                 return;
2514
2515             if (remove_bf8_xy_opt)
2516             {
2517                 auto users_user_layout = node.get_users().front()->get_users().front()->get_output_layout();
2518                 auto input_layout = input.get_output_layout();
2519                 auto target_layout = layout(input_layout.data_type, users_user_layout.format, input_layout.size, input_layout.data_padding);
2520                 input.set_output_layout(target_layout, false);
2521             }
2522             else if (remove_byxf_opt)
2523             {
2524                 auto user = node.get_users().front();
2525                 auto users_users = node.get_users().front()->get_users();
2526
2527                 for (auto const& users_user : users_users)
2528                 {
2529                     if (users_user->get_output_layout().format != format::byxf && !users_user->is_type<eltwise>())
2530                     {
2531                         remove_byxf_opt = false;
2532                         break;
2533                     }
2534                 }
2535
2536                 if (remove_byxf_opt)
2537                 {
2538                     auto input_layout = input.get_output_layout();
2539                     user->set_output_layout(input_layout, false);
2540                 }
2541             }
2542             else
2543                 input.set_output_layout(output_layout, false);
2544
2545             node.can_be_optimized(true);
2546             extract_and_remove(node); //try to remove redundant reorders
2547         });
2548     }
2549 }
2550
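//Fuses an eltwise sum which adds a deconvolution's output to another input (a "skip connection")
//into the deconvolution itself: the eltwise's other input becomes an extra dependency of the
//deconvolution, the deconvolution takes over the eltwise's slot in the processing order (along
//with its fused activation and output padding), and the eltwise node is extracted from the graph.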
2551 void program_impl::fuse_skip_layers(program_node* node)
2552 {
2553     do_for_types<eltwise>(*node, [this](eltwise_node& node)
2554     {
2555         bool skippable = false;
2556         int index = 0;
2557         if (node.get_primitive()->mode != eltwise_mode::sum || node.inputs_count() != 2)
2558             return;
2559
2560         if (node.input(0).is_type<deconvolution>())
2561         {
2562             skippable = true;
2563         }
2564         else if (node.input(1).is_type<deconvolution>())
2565         {
2566             skippable = true;
2567             index = 1;
2568         }
2569
2570         if (!skippable)
2571             return;
2572
2573         auto& to_fuse_with = node.input(index).as<deconvolution>();
2574         int to_fuse_index = index == 0 ? 1 : 0;
2575
2576         // check that node doesn't have fused eltwise already
2577         if (to_fuse_with.has_fused_sum())
2578             return;
2579
2580         //remove dependencies and users of the eltwise that is going to be extracted
2581         add_connection(node.input(to_fuse_index), to_fuse_with);
2582         remove_connection(node.input(to_fuse_index), node);
2583
2584         //give the node where fusing takes place the eltwise's position and processing_num in the processing order
2585         auto new_processing_num = node.processing_num;
2586         processing_order.erase(to_fuse_with.processing_itr);
2587         to_fuse_with.processing_itr = processing_order.insert(node.processing_itr, &to_fuse_with);
2588         to_fuse_with.processing_num = new_processing_num;
2589
2590         //make sure that the fused node's users have a higher processing_num than the fused node
2591         for (auto user : to_fuse_with.get_users())
2592         {
2593             if (user->processing_num < new_processing_num)
2594             {
2595                 processing_order.erase(user->processing_itr);
2596                 user->processing_itr = processing_order.insert(std::next(to_fuse_with.processing_itr), user);
2597                 user->processing_num = new_processing_num + 1;
2598             }
2599         }
2600
2601         if (node.get_fused_activation_func() != activation_none)
2602             to_fuse_with.set_fused_activation(node.get_fused_activation_func(), node.get_fused_activation_params());
2603         to_fuse_with.set_output_padding(node.get_output_layout().data_padding);
2604
2605         extract_and_remove(node);
2606     });
2607 }
2608
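//Runs three fusing passes over the processing order:
// 1) fuse stand-alone activations into the preceding primitive (whitelist-based),
// 2) fuse chains of reorders into a single reorder,
// 3) fuse eltwise (sum) with deconvolution (see fuse_skip_layers).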
2609 void program_impl::prepare_primitive_fusing()
2610 {
2611     bool is_debug = options.get<build_option_type::debug>()->enabled();
2612
2613     auto itr = processing_order.begin(); //note: we need to use iterators since the currently processed element can be removed
2614     while (itr != processing_order.end())
2615     {
2616         auto node_itr = itr++;
2617         auto& node = (*node_itr);
2618
2619         do_for_types<activation>(*node, [this, is_debug](activation_node& node)
2620         {
2621
2622             auto& input = node.input();
2623
2624             //Restrictions:
2625             // - inputs cannot be padded
2626             // - the primitive's input cannot be an output
2627             // - the activation cannot have additional inputs
2628             // - the input cannot already be optimized
2629             if (node.has_padded_dependency() || (input.is_output() && !is_debug) || node.is_output() ||
2630                 node.get_dependencies().size() != 1 || input.can_be_optimized())
2631                 return;
2632
2633             // - check if there is no activation fused already
2634             // - limit to primitives whose implementations support activation fusing
2635             if (input.get_users().size() != 1 || input.get_fused_activation_func() != activation_none ||
2636                 //TODO: new api needs to be created to read such caps
2637                 //right now use a whitelist so that new primitives lacking fused activation support won't be affected
2638                 (!input.is_type<batch_norm>() && !input.is_type<concatenation>() && !input.is_type<convolution>() &&
2639                     !input.is_type<crop>() && !input.is_type<deconvolution>() && !input.is_type<eltwise>() &&
2640                     !input.is_type<fully_connected>() && !input.is_type<lrn>() && !input.is_type<normalize>() &&
2641                     !input.is_type<permute>() && !input.is_type<pooling>() && !input.is_type<reorder>() &&
2642                     !input.is_type<roi_pooling>() && !input.is_type<scale>() &&
2643                     !input.is_type<softmax>() && !input.is_type<upsampling>() && !input.is_type<mvn>()))
2644                 return;
2645
2646             input.set_fused_activation(node.get_primitive()->activation_func, node.get_primitive()->additional_params);
2647             input.set_output_padding(node.get_output_layout().data_padding);
2648
2649             extract_and_remove(node);
2650         });
2651     }
2652
2653     //Second loop tries fusing several reorders one by one (if present) into one reorder
2654     itr = processing_order.begin();
2655     while (itr != processing_order.end())
2656     {
2657         auto node_itr = itr++;
2658         auto& node = (*node_itr);
2659
2660         do_for_types<reorder>(*node, [this, is_debug](reorder_node& node)
2661         {
2662             auto& input = node.input();
2663
2664             //Restrictions:
2665             // - inputs cannot be padded
2666             // - the primitive's input cannot be an output
2667             // - the input cannot already be optimized
2668             if (node.has_padded_dependency() || (input.is_output() && !is_debug) || node.get_dependencies().size() != 1 ||
2669                 input.can_be_optimized())
2670                 return;
2671
2672             // - check if the previous node is a reorder with exactly 1 user
2673             // - do not fuse if current node has mean subtract
2674             if (input.get_users().size() != 1 || !input.is_type<reorder>() ||
2675                 node.has_mean() || !node.get_primitive()->subtract_per_feature.empty())
2676                 return;
2677
2678             input.set_output_layout(node.get_output_layout(), false);
2679             extract_and_remove(node);
2680         });
2681     }
2682     //Third loop tries fusing eltwise (sum) with deconvolution
2683     itr = processing_order.begin();
2684     while (itr != processing_order.end())
2685     {
2686         auto node_itr = itr++;
2687         auto& node = (*node_itr);
2688
2689         fuse_skip_layers(node);
2690     }
2691 }
2692
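//Returns the node already registered under prim->id or, if there is none, creates a node for
//the given primitive and registers it in nodes_map (lower_bound doubles as the insertion hint,
//so the map is searched only once).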
2693 program_node& program_impl::get_or_create(std::shared_ptr<primitive> prim)
2694 {
2695     auto itr = nodes_map.lower_bound(prim->id);
2696     if (itr != nodes_map.end() && itr->first == prim->id)
2697         return *itr->second;
2698
2699     auto new_node = prim->type->create_node(*this, prim);
2700     new_node->set_org_primitive_id(new_node->id());
2701     nodes_map.insert(itr, { prim->id, new_node });
2702     return *new_node;
2703 }
2704
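//Inserts 'node' between 'next' and its dependency at index 'prev_idx'. When
//'connect_int_node_with_old_dep' is set, 'node' is additionally connected to the old dependency
//and spliced into the processing order right after it; the inserted node inherits the constant
//and data_flow flags of the old dependency and takes over its constant_frontier marker.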
2705 void program_impl::add_intermediate(program_node& node, program_node& next, size_t prev_idx, bool connect_int_node_with_old_dep)
2706 {
2707     if (connect_int_node_with_old_dep && !node.dependencies.empty())
2708         throw std::invalid_argument("Node which is about to be added in between two other nodes should not have any existing dependencies");
2709
2710     auto& prev = next.get_dependency(prev_idx);
2711     //first add the connection, then replace the dependency, so 'prev' won't become dangling and get removed
2712     if (connect_int_node_with_old_dep)
2713     {
2714         add_connection(prev, node);
2715         if (node.processing_itr != processing_order.end())
2716             processing_order.erase(node.processing_itr);
2717
2718         auto itr = prev.processing_itr;
2719         node.processing_itr = processing_order.insert(++itr, &node);
2720         node.processing_num = prev.processing_num;
2721     }
2722
2723     next.replace_dependency(prev_idx, node);
2724     node.constant = prev.constant;
2725     node.data_flow = prev.data_flow;
2726     if (prev.constant_frontier)
2727     {
2728         node.constant_frontier = true;
2729         prev.constant_frontier = false;
2730     }
2731 }
2732
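//Re-registers the node in nodes_map under 'new_id' and updates the id stored in its primitive
//descriptor (or, for internal primitives, its internal id). Output nodes cannot be renamed.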
2733 void program_impl::rename(program_node & node, primitive_id const & new_id)
2734 {
2735     if (nodes_map.count(new_id))
2736         throw std::runtime_error("Trying to rename program_node but node with id " + new_id + " already exists");
2737     if (node.is_output())
2738         throw std::invalid_argument("Trying to rename an output node. If you intend to do that, please clear 'output' flag manually.");
2739
2740     auto node_ptr = nodes_map.find(node.id())->second;
2741     nodes_map.emplace(new_id, node_ptr);
2742     nodes_map.erase(node.id());
2743
2744     if (!node.is_type<internal_primitive>())
2745         const_cast<primitive_id&>(node.desc->id) = new_id;
2746     else
2747         reinterpret_cast<details::internal_program_node_base&>(node).internal_id = new_id;
2748 }
2749
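//Exchanges the ids of two nodes by swapping both their nodes_map entries and the ids stored in
//their descriptors.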
2750 void program_impl::swap_names(program_node& node1, program_node& node2)
2751 {
2752     const auto _extract_id = [](program_node& node) -> primitive_id&
2753     {
2754         if (!node.is_type<internal_primitive>())
2755             return const_cast<primitive_id&>(node.desc->id);
2756         else
2757             return reinterpret_cast<details::internal_program_node_base&>(node).internal_id;
2758     };
2759
2760     nodes_map.at(node1.id()).swap(nodes_map.at(node2.id()));
2761     std::swap(_extract_id(node1), _extract_id(node2));
2762 }
2763
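//Redirects every user of 'old_node' to depend on 'new_node' instead. The iterator is advanced
//before replace_dependency is called, since that call removes the user from old_node.users.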
2764 void program_impl::replace_all_usages(program_node & old_node, program_node & new_node)
2765 {
2766     auto itr = old_node.users.begin();
2767     bool end = (itr == old_node.users.end());
2768     while (!end)
2769     {
2770         auto& usage = (*itr++);
2771         end = (itr == old_node.users.end());
2772         usage->replace_dependency(old_node, new_node);
2773     }
2774 }
2775
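//Substitutes 'new_node' for 'old_node' in the graph: dependencies (unless 'replace_whole_branch'
//is set) and users are rewired, the node's flags and processing order slot are copied over,
//'new_node' takes over the old node's id, and the output status is restored after the rename.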
2776 void program_impl::replace(program_node& old_node, program_node& new_node, bool replace_whole_branch, bool check_output_layouts_integrity)
2777 {
2778     if ((!new_node.dependencies.empty() && !replace_whole_branch) || !new_node.users.empty())
2779         throw std::invalid_argument("Node which is about to replace other node should be detached");
2780
2781     if (new_node.is_output())
2782         throw std::invalid_argument("Replacement node shouldn't be marked as an output since it's impossible to rename such node.");
2783
2784     auto id = old_node.id();
2785     new_node.output_layout = old_node.get_output_layout();
2786     new_node.valid_output_layout = old_node.valid_output_layout;
2787
2788     if (!replace_whole_branch)
2789     {
2790         //copy the old node's dependencies
2791         while (!old_node.dependencies.empty())
2792         {
2793             auto& dep = old_node.dependencies.back();
2794             add_connection(*dep, new_node);
2795             remove_connection(*dep, old_node);
2796         }
2797     }
2798
2799     //append users
2800     for (auto& user : old_node.users)
2801     {
2802         new_node.users.push_back(user);
2803         for (auto& users_dep : user->dependencies)
2804         {
2805             if (users_dep == &old_node)
2806             {
2807                 users_dep = &new_node;
2808                 break;
2809             }
2810         }
2811     }
2812
2813     old_node.users.clear();
2814
2815     if (check_output_layouts_integrity && new_node.valid_output_layout)
2816         new_node.recalc_output_layout();
2817
2818     bool old_was_output = false;
2819     //copy node's state
2820     if (old_node.is_output())
2821     {
2822         old_was_output = true;
2823         old_node.set_output(false);
2824         outputs.erase(std::remove(outputs.begin(), outputs.end(), &old_node), outputs.end());
2825     }
2826     if (new_node.is_input())
2827         inputs.push_back(&new_node);
2828     if (old_node.is_input())
2829         inputs.remove(&old_node);
2830
2831     new_node.constant = old_node.constant;
2832     new_node.constant_frontier = old_node.constant_frontier;
2833     new_node.user_mark = old_node.user_mark;
2834
2835     auto old_news_pos = new_node.processing_itr;
2836     new_node.processing_itr = processing_order.insert(old_node.processing_itr, &new_node);
2837     new_node.processing_num = old_node.processing_num;
2838     if (old_news_pos != processing_order.end())
2839         processing_order.erase(old_news_pos);
2840     if (old_node.processing_itr != processing_order.end())
2841         processing_order.erase(old_node.processing_itr);
2842
2843     nodes_map.erase(id);
2844     rename(new_node, id);
2845
2846     //mark new node as an output after renaming
2847     if (old_was_output)
2848     {
2849         new_node.set_output(true);
2850         outputs.push_back(&new_node);
2851     }
2852 }
2853
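//Removes 'node' if nothing uses it. With 'detach_whole_branch' set, the dependency chain is
//walked breadth-first and every node whose users have all been visited is removed as well;
//otherwise the node must have no dependencies either. Outputs are kept unless this is a debug build.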
2854 bool program_impl::remove_if_dangling(program_node& node, bool detach_whole_branch)
2855 {
2856     if (!node.users.empty())
2857         return false;
2858     if (!detach_whole_branch && !node.dependencies.empty())
2859         return false;
2860
2861     std::list<program_node*> to_remove;
2862     std::list<program_node*> marked;
2863     if (detach_whole_branch)
2864     {
2865         node.mark();
2866         std::list<program_node*> queue = { &node };
2867         while (!queue.empty())
2868         {
2869             auto curr = queue.front();
2870             queue.pop_front();
2871             marked.push_back(curr);
2872
2873             //remove only if all users have also been marked
2874             bool rem = !std::any_of(curr->get_users().begin(), curr->get_users().end(), [](program_node* node) { return !node->is_marked(); });
2875             if (rem)
2876                 to_remove.push_back(curr);
2877
2878             for (auto dep : curr->get_dependencies())
2879             {
2880                 if (!dep->is_marked())
2881                 {
2882                     dep->mark();
2883                     queue.push_back(dep);
2884                 }
2885             }
2886         }
2887     }
2888     else
2889         to_remove.push_back(&node);
2890
2891     for (auto n : marked)
2892         n->unmark();
2893
2894     for (auto rem : to_remove)
2895     {
2896         if (!rem->is_output() || is_debug_build())
2897         {
2898             if (detach_whole_branch)
2899             {
2900                 for (auto& user : rem->get_users())
2901                     user->remove_dependency(*rem);
2902             }
2903             if (rem->is_input())
2904                 inputs.remove(rem);
2905
2906             if (std::find(processing_order.begin(), processing_order.end(), rem) != processing_order.end())
2907                 processing_order.erase(rem->processing_itr);
2908             optimized_out.push_back(rem->id());
2909             nodes_map.erase(rem->id());
2910         }
2911     }
2912
2913     return true;
2914 }
2915
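//Extracts a node with a single dependency from the graph, connecting its users directly to that
//dependency. If the node is an output (outside debug builds), the dependency takes over its id
//and output status; the constant_frontier marker is also propagated to the dependency.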
2916 bool program_impl::extract_and_remove(program_node& node)
2917 {
2918     if (node.get_dependencies().size() != 1)
2919         return false;
2920
2921     if (node.is_output() && node.get_dependency(0).is_output() && !is_debug_build()) //TODO: add a mechanism to support removal of nodes which are marked as outputs
2922         return false;
2923
2924     if (node.is_output() && !is_debug_build())
2925     {
2926         auto& prev = node.get_dependency(0);
2927         auto node_id = node.id();
2928
2929         node.set_output(false);
2930         outputs.erase(std::remove(outputs.begin(), outputs.end(), &node), outputs.end());
2931
2932         rename(node, "_cldnn_tmp_" + node_id);
2933         rename(prev, node_id);
2934
2935         prev.set_output(true);
2936         outputs.push_back(&prev);
2937     }
2938
2939     auto& input = node.get_dependency(0);
2940     node.dependencies.clear();
2941     input.users.remove(&node);
2942
2943     if (node.constant_frontier)
2944     {
2945         assert(node.constant && "Constant frontier should also, by definition, be constant");
2946         assert(input.constant && "Input for constant frontier should, by definition, be constant");
2947         input.constant_frontier = true;
2948     }
2949
2950     if (!node.is_endpoint())
2951         replace_all_usages(node, input);
2952     else
2953         remove_if_dangling(node);
2954
2955     return true;
2956 }
2957
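//Attaches the optimized memory buffers from 'replace_map' to the corresponding cldnn::data nodes.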
2958 void program_impl::replace_data_with_optimized(std::map<primitive_id, memory_impl::ptr> const & replace_map)
2959 {
2960     for (auto& result : replace_map)
2961     {
2962         auto& node = *nodes_map.at(result.first);
2963         assert(node.is_type<data>() && "Optimized primitive is not a cldnn::data");
2964         assert(result.second != nullptr && "Memory which handles result of optimization should not be nullptr");
2965         node.as<data>().attach_memory(*result.second, false);
2966     }
2967 }
2968
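//Dumps the memory pool state to cldnn_memory_pool.log (and the program at the "14_memory_pool"
//stage) when the memory pool is enabled and a dump directory has been configured.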
2969 void program_impl::dump_memory_pool() const
2970 {
2971     if (!get_engine().configuration().enable_memory_pool)
2972         return;
2973     auto path = get_dir_path(options);
2974     if (path.empty())
2975     {
2976         return;
2977     }
2978
2979     path += "cldnn_memory_pool.log";
2980     auto dep = get_memory_dependencies_string();
2981     get_engine().dump_memory_pool(*this, path, dep);
2982     dump_program("14_memory_pool", true);
2983 }
2984
2985 //TODO: break this function into a number of smaller ones + add per-primitive fields (possibly use primitive_inst::to_string?)
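//Dumps the program at the given stage into the configured directory: always a .graph file and,
//when 'with_full_info' is set, .info, .order and .optimized files as well.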
2986 void program_impl::dump_program(const char* stage, bool with_full_info, std::function<bool(program_node const&)> const& filter) const
2987 {
2988     auto path = get_dir_path(options);
2989     if (path.empty())
2990     {
2991         return;
2992     }
2993
2994     std::ofstream graph(path + "cldnn_program_" + std::to_string(prog_id) + "_" + stage + ".graph");
2995     dump_graph_init(graph, *this, filter);
2996
2997     if (!with_full_info)
2998     {
2999         return;
3000     }
3001
3002     graph.close(); graph.open(path + "cldnn_program_" + std::to_string(prog_id) + "_" + stage + ".info"); //close first: ofstream::open fails on an already-open stream
3003     dump_graph_info(graph, *this, filter);
3004
3005     graph.close(); graph.open(path + "cldnn_program_" + std::to_string(prog_id) + "_" + stage + ".order");
3006     dump_graph_processing_order(graph, *this);
3007
3008     graph.close(); graph.open(path + "cldnn_program_" + std::to_string(prog_id) + "_" + stage + ".optimized");
3009     dump_graph_optimized(graph, *this);
3010 }
3011
3012 //Dumps weights and biases in the serialization process; not working yet, in progress.
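//For every cldnn::data dependency in the graph, its buffer is appended to 'file_stream' (with the
//element size deduced from the data type: f32 vs. a 16-bit type) and its id is recorded; 'offsets'
//carries the running byte offsets, presumably advanced inside dump_data.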
3013 void program_impl::dump_weights_and_biasses(std::vector<unsigned long long>& offsets, std::vector<std::string>& data_names, std::ofstream& file_stream) const
3014 {
3015     for (auto const& n : nodes_map)
3016     {
3017         auto dependency_count = (unsigned int)n.second.get()->get_dependencies().size();
3018         for (unsigned int dp = 0; dp < dependency_count; dp++)
3019         {
3020             auto& dependency = n.second.get()->get_dependency(dp);
3021             if (dependency.is_type<data>())
3022             {
3023                 offsets.push_back(offsets.empty() ? 0ull : offsets.back());
3024                 auto& mem = dependency.as<data>().get_attached_memory();
3025                 if (mem.get_layout().data_type == data_types::f32)
3026                     dump_data(mem, file_stream, offsets.back(), sizeof(float));
3027                 else
3028                     dump_data(mem, file_stream, offsets.back(), sizeof(short));
3029                 data_names.push_back(dependency.as<data>().id());
3030             }
3031         }
3032     }
3033     file_stream.close();
3034 }
3035
3036 //Serializes the program under the given name.
3037 //Placeholder, not working yet, in progress.
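//Writes the kernel binaries together with weights/biases into <network_name>_serialization.bin
//and an XML description of the program into <network_name>_serialization.xml.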
3038 void program_impl::serialize(std::string network_name, std::function<bool(program_node const&)> const& filter) const
3039 {
3040     std::vector<unsigned long long> offsets;
3041     std::vector<std::string> data_names;
3042
3043     std::ofstream file_stream(network_name + "_" + "serialization" + ".bin", std::ios::binary);
3044     dump_kernels(engine->get_context().get()->get_kernels_cache().get_context().get_binaries(), offsets, data_names, file_stream);
3045     dump_weights_and_biasses(offsets, data_names, file_stream);
3046
3047     std::ofstream graph(network_name + "_" + "serialization" + ".xml");
3048     dump_to_xml(graph, *this, filter, offsets, data_names);
3049 }