///////////////////////////////////////////////////////////////////////////////////////////////////
-#include "program_impl.h"
-#include "primitive_inst.h"
-#include "layout_optimizer.h"
-#include "constants_propagator.h"
-
-#include "primitive_type.h"
-#include "api/CPP/activation.hpp"
-#include "api/CPP/eltwise.hpp"
-#include "api/CPP/input_layout.hpp"
-#include "api/CPP/pooling.hpp"
-#include "api/CPP/proposal.hpp"
-#include "api/CPP/roi_pooling.hpp"
-#include "api/CPP/reorg_yolo.hpp"
-
-#include "activation_inst.h"
-#include "batch_norm_inst.h"
+#include "error_handler.h"
+#include "kernel_selector_helper.h"
#include "internal_primitive.h"
#include "internal_primitive_type_base.h"
+#include "layout_optimizer.h"
+#include "pass_manager.h"
+#include "primitive_type.h"
+#include "program_dump_graph.h"
+#include "program_helpers.h"
+#include "program_impl.h"
+#include "sliding_window_utils.h"
+
#include "convolution_inst.h"
#include "concatenation_inst.h"
#include "crop_inst.h"
#include "data_inst.h"
-#include "mutable_data_inst.h"
#include "deconvolution_inst.h"
#include "detection_output_inst.h"
-#include "lrn_inst.h"
-#include "normalize_inst.h"
-#include "permute_inst.h"
+#include "input_layout_inst.h"
+#include "lstm_inst.h"
+#include "lstm_elt_inst.h"
+#include "lstm_gemm_inst.h"
+#include "mutable_data_inst.h"
+#include "pooling_inst.h"
+#include "primitive_inst.h"
#include "prior_box_inst.h"
+#include "proposal_inst.h"
#include "reorder_inst.h"
#include "reshape_inst.h"
-#include "scale_inst.h"
-#include "embed_inst.h"
-#include "softmax_inst.h"
#include "split_inst.h"
-#include "program_dump_graph.h"
-#include "upsampling_inst.h"
-#include "eltwise_inst.h"
-#include "fully_connected_inst.h"
-#include "mvn_inst.h"
-#include "lstm_inst.h"
-#include "lstm_gemm_inst.h"
-#include "lstm_elt_inst.h"
-#include "embed_inst.h"
-#include "network_impl.h"
-#include "kernel_selector_helper.h"
-#include "sliding_window_utils.h"
-#include "error_handler.h"
+#include "gpu/ocl_toolkit.h"
#include <fstream>
#include <algorithm>
#include <iostream>
#include <sstream>
#include <iomanip>
+#include <memory>
+program_impl::program_impl(engine_impl& engine_ref, topology_impl const& topology, build_options const& options, bool is_internal, bool no_optimizations)
+ : engine(&engine_ref), options(options), processing_order(* new nodes_ordering), pm(std::unique_ptr<pass_manager>(new pass_manager()))
+{
+ set_options();
+ prepare_nodes(topology);
+ if (no_optimizations)
+ init_graph();
+ else
+ build_program(is_internal);
+}
-namespace {
-
- //helper function for selecting function basing on the type of the given primitive
- //this is the termination case for parameter pack recurrence, see overload below for logic
- template <class... T>
- void do_for_types(program_node&)
- {
- return;
- }
-
- //helper function for selecting function basing on the type of the given primitive
- //this function should be explicitly given set of types and implicitly set of functions.
- //both sets should have equal size. First function will be called if type of the given primitive
- //will match first explicitly given type, second will be called if it matches second explicitly given
- //type etc.
- //Functions given as arguments should themselves take std::shared_ptr<const T> as argument
- //where T is the type that should be match if this function should be called
- //
- //example:
- // do_for_types<
- // convolution,
- // pooling
- // >(primitive,
- // [](typed_program_node<convolution>&){ do something if 'primitive' is a convolution },
- // [](typed_program_node<pooling>&) { do something if 'primitive' is a pooling }
- // );
- template <class T, class... RestOfT, class Func, class... RestOfFuncs>
- decltype(static_cast<void>(std::declval<Func>()(std::declval<typed_program_node<T>&>()))) do_for_types(
- program_node& node,
- Func const& func,
- RestOfFuncs const&... rest)
- {
- if (node.type() == T::type_id())
- func(node.as<T>());
- else
- do_for_types<RestOfT...>(node, rest...);
- }
-
- template <class T>
- struct single_element_container
- {
- single_element_container(T& t) : elem(&t)
- {}
-
- constexpr size_t size() const { return 1; }
- single_element_container<T> begin() const { return single_element_container(elem); }
- single_element_container<T> end() const { return single_element_container(nullptr); }
- single_element_container<T>& operator ++() { elem = nullptr; return *this; }
- bool operator !=(single_element_container const& sec) { return elem != sec.elem; }
-
- T operator *() { return *elem; }
-
- private:
- single_element_container(T* t) : elem(t)
- {}
-
- T* elem;
- };
-
- //helper function which creates single-element array if it's given anything
- //other than std::vector.
- //It should be used in generic code when there's a need to force vector usage
- //in foreach loop over variable which can in one context be a vector or a scalar
- //in another.
- //example:
- // T t;
- // for (auto& string : wrap_if_single(t.dump()))
- //depending on type T, t.dump() may return either std::string or std::vector<std::string>,
- //to ensure compatibility between these cases, wrap_if_single will create single-element
- //container in case t.dump() would return plain std::string.
- //
- // T& case -> returns container which holds T&
- template <class T>
- single_element_container<T> wrap_if_single(T& t)
- {
- return single_element_container<T>(t);
- }
-
- //helper function which creates single-element array if it's given anything
- //other than std::vector.
- // T const& case -> returns container which holds T const&
- template <class T>
- single_element_container<T const> wrap_if_single(T const& t)
- {
- return single_element_container<T const>(t);
- }
-
- //helper function which creates single-element array if it's given anything
- //other than std::vector.
- // T&& case -> returns container which holds new instance of T created by moving given param
- template <class T>
- single_element_container<T> wrap_if_single(T&& t)
- {
- static_assert(meta::always_false<T>::value, "Wrapping temporary object into single_element_container is an error (requires valid reference)");
- return single_element_container<T>(t);
- }
-
- //helper function which creates single-element array if it's given anything
- //other than std::vector.
- // std::vector case -> does not wrap, returns t as-is
- primitive::fixed_size_vector_ref const& wrap_if_single(primitive::fixed_size_vector_ref const& t)
- {
- return t;
- }
-
- //helper function for merging the weights/biases buffers on cpu side for depthwise separable convolution optimization
- void merge_buffers(engine_impl::ptr engine, program_node &node, layout target_layout, size_t begin_offset, size_t end_offset)
- {
- memory_impl::ptr data_to_allocate = engine->allocate_memory(target_layout);
-
- for (size_t i = begin_offset; i < end_offset; i++)
- {
- auto& weights = node.get_dependency(i).as<data>();
- mem_lock<char> src{ weights.get_attached_memory() };
- mem_lock<char> dst{ data_to_allocate };
- std::copy(src.begin(), src.end(), dst.begin() + (i - begin_offset)*src.size());
- }
-
- for (size_t i = 0; i < end_offset - begin_offset - 1; i++)
- node.remove_dependency(begin_offset + 1);
+program_impl::program_impl(engine_impl& engine_ref, std::set<std::shared_ptr<program_node>> const& nodes, build_options const& options, bool is_internal)
+ : engine(&engine_ref), options(options), processing_order(*new nodes_ordering), pm(std::unique_ptr<pass_manager>(new pass_manager()))
+{
+ set_options();
+ prepare_nodes(nodes);
+ build_program(is_internal);
+}
- auto& data_node = node.get_dependency(begin_offset).as<data>();
- data_node.attach_memory(*data_to_allocate, false);
- }
+program_impl::~program_impl() = default;
- //helper function for getting target layout used in depthwise sep optimization
- layout get_weights_layout(typed_program_node<cldnn::data> &data_node, int32_t split)
+program_node& program_impl::get_node(primitive_id const& id)
+{
+ try
{
- auto mem_layout = data_node.get_output_layout();
-
- return layout(mem_layout.data_type, mem_layout.format, { split * mem_layout.size.batch[0], mem_layout.size.feature[0], mem_layout.size.spatial[0], mem_layout.size.spatial[1] });
+ return *nodes_map.at(id);
}
-
- // pair.first tells whether l1 and l2 are absolutely identical
- // pair.second tells whether l1 and l2 can be reinterpreted to each other without need of reordering
- // note: layouts can only be considered identical if data size described by both layouts match (so no data are genereted nor dropped)
- // note: if layouts describe two buffers with different size, consider them not to be identical even if smaller buffer can be considered to hold subsequence of larger buffer,
- // this behavior is required to force buffer allocation for smaller buffer which, currently, should always be performed
- std::pair<bool, bool> are_layouts_identical(layout const& l1, layout const& l2)
+ catch (...)
{
- if (l1 == l2)
- return{ true, true };
- if (l1.data_type != l2.data_type)
- return{ false, false };
- if (l1.size != l2.size)
- return{ false, false };
- if (l1.get_linear_size() != l2.get_linear_size())
- return{ false, false };
- if ((l1.format == format::bf8_xy16 && l2.format != format::bf8_xy16) ||
- (l2.format == format::bf8_xy16 && l1.format != format::bf8_xy16))
- return{ false, false };
-
- auto l1_pitch = l1.get_pitches();
- auto l2_pitch = l2.get_pitches();
-
- //ignore pitches which will never be used (for dims with size == 1)
- for (size_t i = 0; i < CLDNN_TENSOR_DIM_MAX; ++i)
- if (l1.size.raw[i] == 1)
- l1_pitch.raw[i] = 0;
- for (size_t i = 0; i < CLDNN_TENSOR_DIM_MAX; ++i)
- if (l2.size.raw[i] == 1)
- l2_pitch.raw[i] = 0;
-
- auto l1_offset = l1.get_linear_offset();
- auto l2_offset = l2.get_linear_offset();
- if (l1_pitch == l2_pitch && l1_offset == l2_offset)
- return{ false, true };
-
- return{ false, false };
+ throw std::runtime_error("Program doesn't contain primtive node: " + id);
}
}
-program_impl::program_impl(engine_impl& engine_ref, topology_impl const& topology, build_options const& options, bool is_internal)
- : engine(&engine_ref), options(options), output_size_handling_enabled(true)
+program_node const& program_impl::get_node(primitive_id const& id) const
{
- static std::atomic<uint32_t> id_gen{ 0 };
- prog_id = ++id_gen;
- assert(prog_id != 0);
-
- if ((options.get<build_option_type::tuning_config>()->config.mode == tuning_mode::tuning_tune_and_cache) &&
- !engine->configuration().enable_profiling)
+ try
{
- throw std::invalid_argument("Engine must be created with profiling enabled in tune_and_cache mode!");
+ return *nodes_map.at(id);
}
-
- init_graph(topology);
- pre_optimize_graph();
- compile_graph();
- post_optimize_graph();
-
- engine->compile_program(*this);
- this->dump_program("13_finished", true);
-
- //Makes serialization with given name.
- //Placeholder, not working yet, in progress.
- auto serialization_network_name = get_serialization_network_name(options);
- if (!serialization_network_name.empty() && !is_internal)
+ catch (...)
{
- this->serialize(serialization_network_name);
+ throw std::runtime_error("Program doesn't contain primtive node: " + id);
}
-
- cleanup();
}
// TODO: Remove once we will get full support for input/output padding in all primitive implementations.
-void program_impl::analyze_output_size_handling_need()
+bool program_impl::analyze_output_size_handling_need()
{
bool handling_needed = false;
}
}
- output_size_handling_enabled = handling_needed;
+ return handling_needed;
}
-std::list<std::shared_ptr<program_node>> program_impl::get_nodes() const
+// create new nodes for a program based on the set of nodes
+// method created to be used by propagate_constants to build sub program from constant nodes
+void program_impl::prepare_nodes(std::set<std::shared_ptr<program_node>>const &nodes)
{
- std::list<std::shared_ptr<program_node>> ret;
-
- for (auto& node : processing_order)
- ret.push_back(nodes_map.at(node->id()));
- return ret;
+ for (const auto& itr : nodes)
+ {
+ if (itr.get()->is_type<data>())
+ {
+ get_or_create(
+ std::make_shared<input_layout>(itr.get()->id(), itr.get()->as<data>().get_primitive()->mem.get_layout())
+ );
+ }
+ else
+ {
+ get_or_create(itr->desc);
+ }
+ }
+ for (const auto& node : nodes_map)
+ {
+ auto node_ptr = node.second;
+ if (node_ptr == nullptr)
+ throw error("NULL pointer in nodes_map.", CLDNN_ERROR);
+ //ToDo: avoid O(n^2) run time here (pass map instead of set?)
+ bool found = false;
+ for (const auto& src_node : nodes)
+ {
+ if (src_node == nullptr)
+ throw error("NULL pointer in nodes_map.", CLDNN_ERROR);
+ if (node.first == src_node->get_primitive()->id)
+ {
+ copy_node_dependencies(node_ptr.get(), src_node.get());
+ found = true;
+ break;
+ }
+ }
+ if (!found)
+ {
+ add_node_dependencies(node_ptr.get());
+ }
+ if (node_ptr->dependencies.size() == 0)
+ inputs.push_back(node_ptr.get());
+ }
}
-void program_impl::init_graph(topology_impl const& topology)
+// create all nodes from topology primitives, add dependencies among them and create inputs list
+void program_impl::prepare_nodes(topology_impl const &topology)
{
auto const& topo_map = topology.get_primitives();
- for (auto const& prim : topo_map)
+ for (const auto& prim : topo_map)
{
- auto& n = get_or_create(prim.second);
- inputs.push_back(&n);
+ get_or_create(prim.second);
+ }
+ add_split_outputs();
+ for (const auto& node : nodes_map)
+ {
+ auto node_ptr = node.second.get();
+ if (node_ptr == nullptr)
+ throw error("NULL pointer in nodes_map.", CLDNN_ERROR);
+ add_node_dependencies(node_ptr);
+ if (node_ptr->dependencies.size()==0)
+ {
+ inputs.push_back(node_ptr);
+ }
}
- replace_nodes_pre();
+}
- for (auto itr = inputs.begin(); itr != inputs.end(); )
+// add node's dependecies from its primitive dependencies
+void program_impl::add_node_dependencies(program_node* node)
+{
+ auto deps = node->get_primitive()->dependencies();
+ //add pointers to node's dependencies
+ for (auto& dep : deps)
{
- auto node_itr = itr++;
- auto& node = (*node_itr);
- auto deps = node->get_primitive()->dependencies();
- if (deps.empty())
- continue;
+ try {
+ auto dep_node = nodes_map.at(dep);
+ node->dependencies.push_back(dep_node.get());
+ dep_node->users.push_back(node);
+ }
+ catch (...) {
+ throw std::runtime_error("Program doesn't contain primitive: " + dep +
+ " that is input to: " + node->get_primitive()->id);
+ }
+ }
+}
- //add pointers to node's dependencies
- for (auto& dep : deps)
- {
- try {
- auto dep_node = nodes_map.at(dep);
- node->dependencies.push_back(dep_node.get());
- dep_node->users.push_back(node);
- }
- catch (...) {
- throw std::runtime_error("Program doesn't contain primitive: " + dep +
- " that is input to: " + node->get_primitive()->id);
- }
+/* helper method for program_impl constructor from list of nodes which
+ copies src_node dependecies to the destination node dest_node dependencies.
+ But only to those which appaer in this program implementation nodes_map */
+void program_impl::copy_node_dependencies(program_node* dest_node, program_node* src_node)
+{
+ if (dest_node->get_primitive()->id != src_node->get_primitive()->id)
+ {
+ throw std::runtime_error("Node " + src_node->get_primitive()->id + " and its copy " + dest_node->get_primitive()->id + " do not match.");
+ }
+ auto src_deps = src_node->get_dependencies();
+ //add pointers to node's dependencies
+ for (auto& src_dep : src_deps)
+ {
+ // do not copy dependencies to nodes which does not belong to the new (subgraph) topology
+ if (nodes_map.find(src_dep->get_primitive()->id) == nodes_map.end()) continue;
+
+ try {
+ auto dest_dep = nodes_map.at(src_dep->get_primitive()->id);
+ dest_node->dependencies.push_back(dest_dep.get());
+ dest_dep->users.push_back(dest_node);
}
+ catch (...) {
+ throw std::runtime_error("Program doesn't contain primitive: " + src_dep->get_primitive()->id +
+ " that is input to: " + src_node->get_primitive()->id);
+ }
+ }
+}
+
+void program_impl::set_options()
+{
+ static std::atomic<uint32_t> id_gen{ 0 };
+ prog_id = ++id_gen;
+ assert(prog_id != 0);
- //primitive has dependencies so remove it from 'inputs'
- inputs.erase(node_itr);
+ if ((options.get<build_option_type::tuning_config>()->config.mode == tuning_mode::tuning_tune_and_cache) &&
+ !engine->configuration().enable_profiling)
+ {
+ throw std::invalid_argument("Engine must be created with profiling enabled in tune_and_cache mode!");
}
+}
- replace_nodes_post();
- handle_lstm();
- set_outputs();
- calc_processing_order();
+void program_impl::build_program(bool is_internal)
+{
+ init_graph();
+ {
+ pre_optimize_graph(is_internal);
+ }
+ run_graph_compilation();
+ {
+ post_optimize_graph(is_internal);
+ }
+ engine->compile_program(*this);
+ this->dump_program("finished", true);
+ cleanup();
+}
- dump_program("0_init", true);
+void program_impl::init_graph()
+{
+ graph_initializations graph_initializations_pass;
+ pm->run(*this, graph_initializations_pass);
+
+ calculate_prior_boxes calculate_prior_boxes_pass;
+ pm->run(*this, calculate_prior_boxes_pass);
+
+ mark_nodes mark_nodes_pass;
+ pm->run(*this, mark_nodes_pass);
+}
- calc_prior_boxes(); dump_program("1_calculated_prior_boxes", true);
- mark_constants();
- mark_data_flow();
- dump_program("2_analyzed_graph", true);
+void program_impl::run_graph_compilation() {
+ compile_graph compile_graph_pass;
+ pm->run(*this, compile_graph_pass);
}
-void program_impl::pre_optimize_graph()
+void program_impl::pre_optimize_graph(bool is_internal)
{
- trim_to_outputs(); dump_program("3_trimmed", true);
- calculate_BFS_processing_order();
- analyze_output_size_handling_need();
+ trim_to_outputs trim_pass; //trim to outputs
+ pm->run(*this, trim_pass); // ToDo remove hidden dependencies from trimm pass
+
+ handle_input_padding handle_input_padding; // handle symmetric and asymmetric padding for input
+ pm->run(*this, handle_input_padding);
+
+ add_reshape_to_primitives add_reshape_to_primitives_pass; // add reshape to input/parameters for some primitives
+ pm->run(*this, add_reshape_to_primitives_pass);
+
+ processing_order.calculate_BFS_processing_order(); // this method makes sense only for OOOQ (out of order execution queue)
+
+ bool output_size_handling_enabled = analyze_output_size_handling_need();
for (auto& node : processing_order)
{
if (!node->is_type<internal_primitive>() && !node->is_type<data>())
if (options.get<build_option_type::optimize_data>()->enabled())
{
- prepare_primitive_fusing();
+ prepare_primitive_fusing prepare_primitive_fusing_pass;
+ pm->run(*this, prepare_primitive_fusing_pass);
+
layout_optimizer lo(output_size_handling_enabled);
- reorder_inputs(lo);
- // this code should move to post compilation after kernel selector will support handling reorder bias
- pre_optimize_bias(lo);
- dump_program("4_reordered_inputs", true);
+ reorder_inputs reorder_inputs_pass(lo);
+ pm->run(*this, reorder_inputs_pass);
+
+ // this code should be moved to post compilation after kernel selector will support handling reorder bias
+ pre_optimize_bias pre_optimize_bias_pass(lo);
+ pm->run(*this, pre_optimize_bias_pass);
+
+ // passes regarding conv + eltwise optimizations
+
+ // shrinking eltwise if users are conv 1x1 with stride > 1 optimization
+ eltwise_shrinking eltwise_shrinking_pass;
+ pm->run(*this, eltwise_shrinking_pass);
+
+ // trying to set stride to 1x1 by shrinking convolutions before eltwise if doable
+ eltwise_remove_stride eltwise_remove_stride_pass;
+ pm->run(*this, eltwise_remove_stride_pass);
+
+ prepare_conv_eltw_fusing prepare_conv_eltw_fusing_pass;
+ pm->run(*this, prepare_conv_eltw_fusing_pass);
+
+ prepare_conv_eltw_read_write_opt prepare_conv_eltw_read_write_opt_pass;
+ pm->run(*this, prepare_conv_eltw_read_write_opt_pass);
}
handle_reshape();
- remove_redundant_reorders(); dump_program("5_removed_redundant_reorders", true);
- prepare_padding();
- prepare_depthwise_sep_opt();
- propagate_constants(); dump_program("6_propagated_constants", true);
+ remove_redundant_reorders remove_redundant_reorders_pass;
+ pm->run(*this, remove_redundant_reorders_pass);
+
+ prepare_padding prepare_padding_pass(output_size_handling_enabled);
+ pm->run(*this, prepare_padding_pass);
+
+ prepare_depthwise_sep_opt prepare_depthwise_sep_opt_pass;
+ pm->run(*this, prepare_depthwise_sep_opt_pass);
+
+ if (!is_internal)
+ {
+ propagate_constants propagate_constants_pass; // ToDo remove hidden dependencies from propagate_constants pass
+ pm->run(*this, propagate_constants_pass);
+ }
//try to fuse buffers (i.e. depth_concat in bfyx format) after padding calculations
if (options.get<build_option_type::optimize_data>()->enabled())
{
- prepare_buffer_fusing();
+ prepare_buffer_fusing prepare_buffer_fusing_pass;
+ pm->run(*this, prepare_buffer_fusing_pass);
}
- dump_program("7_pre_optimized", true);
+ //check if there exists some layout incompatibilities and add an reorder node if required
+ add_required_reorders add_required_reorders_pass;
+ pm->run(*this, add_required_reorders_pass);
}
-void program_impl::compile_graph()
+void program_impl::post_optimize_graph(bool is_internal)
{
- for (auto& node : processing_order)
+ layout_optimizer lo;
+ post_optimize_weights post_optimize_weights_pass(lo);
+ pm->run(*this, post_optimize_weights_pass);
+
+ remove_redundant_reorders remove_redundant_reorders_pass;
+ pm->run(*this, remove_redundant_reorders_pass); //TODO: do we need it at this place also?
+
+ if (!is_internal)
{
- if (!node->is_type<internal_primitive>() && !node->is_type<data>())
- {
- node->get_output_layout();
- if (!node->is_type<data>() && !(node->is_type<mutable_data>() && node->get_dependencies().empty()))
- node->selected_impl = node->type()->choose_impl(*engine, *node);
- }
+ propagate_constants propagate_constants_pass; // ToDo remove hidden dependencies from propagate_constants pass
+ pm->run(*this, propagate_constants_pass);
}
- dump_program("8_compiled", true);
+ prep_opt_depthwise_sep_post prep_opt_depthwise_sep_post_pass;
+ pm->run(*this, prep_opt_depthwise_sep_post_pass);
+
+ prepare_memory_dependencies();
}
-void program_impl::post_optimize_graph()
+// mark if the node is constant assuming that all dependencies are marked properly
+void program_impl::mark_if_constant(program_node& node)
{
- layout_optimizer lo;
- post_optimize_weights(lo); dump_program("9_reordered_weights", true);
- remove_redundant_reorders(); dump_program("10_removed_redundant_reorders", true); //TODO: do we need it at this place also?
- propagate_constants(); dump_program("11_propagated_constants", true);
- prep_opt_depthwise_sep_post();
- update_processing_numbers(); dump_program("12_validated_processing_order", true);
- prepare_memory_dependencies();
+ if (node.get_dependencies().empty())
+ return;
+ if (node.is_type<prior_box>())
+ return;
+ node.constant = true;
+ for (auto& dep : node.get_dependencies())
+ {
+ if (!dep->constant)
+ {
+ node.constant = false;
+ break;
+ }
+ }
}
+// mark if the node is in data flow assuming that all dependencies are marked properly
+void program_impl::mark_if_data_flow(program_node& node)
+{
+ if (node.is_type<mutable_data>() || node.is_type<input_layout>())
+ {
+ node.data_flow = true;
+ }
+ else
+ {
+ node.data_flow = false;
+ size_t inputs_count = node.get_dependencies().size();
+ if (node.is_type<detection_output>() || node.is_type<proposal>())
+ inputs_count = 2; //ignore third input as it is related to prior boxes (i.e. concat of prior-boxes)
+ for (size_t idx = 0; idx < inputs_count; idx++)
+ {
+ if (node.get_dependency(idx).is_in_data_flow())
+ {
+ node.data_flow = true;
+ return;
+ }
+ }
+ }
+}
void program_impl::cleanup()
{
for (auto& node : processing_order)
}
}
-std::string get_id_string(size_t i) {
- std::stringstream ss;
- ss << std::setw(5) << std::setfill('0') << i;
- return ss.str();
-}
-
-void program_impl::replace_nodes_pre()
-{
+void program_impl::add_split_outputs() {
auto itr = nodes_map.begin();
while (itr != nodes_map.end())
{
auto node_itr = itr++;
auto& node = (*node_itr).second;
- //find split primitives and create crop primitives out of them
if (node->is_type<split>())
{
auto split_prim = node->as<split>().typed_desc();
}
}
+program_impl::nodes_ordering& program_impl::get_processing_order()
+{
+ return processing_order;
+}
-void program_impl::replace_nodes_post()
+const program_impl::nodes_ordering& program_impl::get_processing_order() const
{
- auto itr = nodes_map.begin(); //note we need to use iterators since currently processed element can be removed
- while (itr != nodes_map.end())
- {
- auto node_itr = itr++;
- auto& node = (*node_itr).second;
+ return processing_order;
+}
- //find split primitives and create crop primitives out of them
- if (node->is_type<split>())
+void add_memory_dependency(program_node* node, program_node* dep)
+{
+ if (node->can_be_optimized() ||
+ !dep->can_be_optimized())
+ {
+ node->add_memory_dependency(dep->id());
+ }
+ else
+ {
+ if (node->id() == dep->id())
+ {
+ return;
+ }
+ for (auto subdep : dep->get_dependencies())
{
- //check if split is not used by any primitive, as it will be optimized
- if (node->get_users().size() != 0)
- throw std::logic_error("Split layer cannot be used directly! Please use split output \"" + node->id() + ":<split_output_id>\"!");
+ add_memory_dependency(node, subdep);
+ add_memory_dependency(subdep, node);
+ }
+ }
+}
- //get_output size and validate split primitive inputs
- auto output_layout = node->get_output_layout();
- auto output_layout_size = output_layout.size;
+void program_impl::basic_memory_dependencies()
+{
+ auto itr = processing_order.begin();
+ std::vector<primitive_id> past_outputs;
+ while (itr != processing_order.end())
+ {
+ auto& node = *itr;
+ itr++;
- auto split_prim = node->as<split>().typed_desc();
- primitive_id input_id = split_prim->input[0];
- auto split_num = split_prim->output_offsets.size();
+ //data primitive can't be reused
+ if (node->is_type<data>())
+ continue;
- //create crop for each split ouptut provided
- for (decltype(split_num) i = 0; i < split_num; i++)
- {
- primitive_id output_id = node->id() + ":" + split_prim->output_ids[i];
+ // add my dependencies to restriction list (can't share input.output buffers)
+ for (auto it : node->get_dependencies())
+ {
+ add_memory_dependency(node, it);
+ add_memory_dependency(it, node);
+ }
- auto node_ptr = nodes_map.find(output_id)->second;
+ // Note we iterate over processing order, it means if primitve has processing num greater than any of outputs, this output
+ // has to land on the primitve restriction list. Otherwise memory reuse can corrupt final results.
+ node->add_memory_dependency(past_outputs);
+ // if current node is an output add it to the outputs list after restriction.
+ if (node->is_output())
+ past_outputs.push_back(node->id());
+ }
+}
- //calculate crop reference input size
- tensor reference_input_size;
- for (decltype(split_num) j = 0; j < i; j++)
- reference_input_size += split_prim->output_offsets[j + 1] - split_prim->output_offsets[j];
+void program_impl::skipped_branch_memory_dependencies()
+{
+ // Primitive A can't use primitive B buffer if processing_num(B) < processing_num(A) and for any usr - the user of B processing_num(usr) > processing_num(A)
+ // Otherwise it could override data that has to be used in the future.
+ auto itrB = processing_order.begin();
+ while (itrB != processing_order.end())
+ {
+ auto& nodeB = *itrB;
+ auto itrA = ++itrB;
+ if (nodeB->get_users().size()==0)
+ continue;
- for (decltype(split_num) j = i; j < split_num - 1; j++)
- reference_input_size += split_prim->output_offsets[j + 1] - split_prim->output_offsets[j];
-
- reference_input_size = output_layout_size - reference_input_size;
-
- //update crop primitive and add connections
- node_ptr->set_output_padding(output_layout.data_padding);
- auto crop_prim = node_ptr->as<crop>().typed_desc();
- crop_prim->reference_input = reference_input_size;
-
- add_connection(node->get_dependency(0), *node_ptr);
- }
-
- //remove input->split connection and remove original split node
- remove_connection(node->get_dependency(0), *node);
- optimized_out.push_back(node->id());
- nodes_map.erase(node->id());
- continue;
- }
-
- //find upsampling primitives with bilinear filtering and create deconvolution with proper weights instead
- if (node->is_type<upsampling>())
- {
- auto upsampling_prim = node->as<upsampling>().typed_desc();
-
- if (upsampling_prim->sample_type != upsampling_sample_type::bilinear)
- continue;
-
- //check if num_filter is not 0 (required for bilinear upsampling)
- if (upsampling_prim->num_filter == 0)
- throw std::logic_error("num_filter in upsampling cannot be 0 in bilinear filtering mode in \"" + node->id() + "\"!");
-
- primitive_id upsampling_id = node->id();
- auto& input_node = node->get_dependency(0);
-
- primitive_id input_id = upsampling_prim->input[0];
- auto num_filter = upsampling_prim->num_filter;
-
- //setting deconvolution parameters based on upsampling input
- auto scale = static_cast<tensor::value_type>(upsampling_prim->scale);
- tensor stride(1, 1, scale, scale);
- auto offset = static_cast<tensor::value_type>(std::ceil((scale - 1) / 2.f));
- tensor input_offset(0, 0, -offset, -offset);
-
- //setting weights for deconvolution
- auto kernel_size = static_cast<tensor::value_type>((2 * scale) - (scale % 2));
- layout weights_layout(data_types::f32, format::bfyx, tensor(1, 1, kernel_size, kernel_size));
-
- std::vector<primitive_id> weights_vec;
- for (uint32_t weights_idx = 0; weights_idx < num_filter; weights_idx++)
- {
- memory_impl::ptr data_to_allocate = engine->allocate_memory(weights_layout);
- mem_lock<float> dst{ data_to_allocate };
- float *dst_data = dst.data();
- //initialize with bilinear weights data
- auto f = static_cast<uint32_t>(std::ceil(kernel_size / 2.0f));
- float c = (2 * f - 1 - f % 2) / (2.f * f);
- float x = 0.f;
- float y = 0.f;
- for (size_t i = 0; i < weights_layout.count(); ++i) {
- x = static_cast<float>(i % kernel_size);
- y = static_cast<float>((i / kernel_size) % kernel_size);
- dst_data[i] = (1 - std::abs(x / f - c)) * (1 - std::abs(y / f - c));
- }
-
- //create weights primitive, with dummy memory which will be replaced in firther step
- primitive_id weights_id = upsampling_id + "_deconvolution_weights" + std::to_string(weights_idx);
- layout dummy_layout(data_types::f32, format::bfyx, tensor(1, 1, 1, 1));
- float zero = 0.f;
- auto weights_prim = std::make_shared<data>(weights_id, memory::attach(dummy_layout, &zero, 1));
- get_or_create(weights_prim);
-
- weights_vec.push_back(weights_id);
-
- auto weights_node_ptr = nodes_map.find(weights_id)->second;
-
- //attach weights buffer
- auto& data_node = weights_node_ptr->as<data>();
- data_node.attach_memory(*data_to_allocate, false);
- }
-
- //remove upsampling node, rename it and move to the optimized list
- remove_connection(node->get_dependency(0), *node);
- auto rename_id = upsampling_id + "_tmp";
- rename(*node, rename_id);
-
- //create deconvolution primitive
- auto deconv_prim = std::make_shared<deconvolution>(upsampling_id, input_id, weights_vec, stride, input_offset);
- get_or_create(deconv_prim);
-
- auto deconv_node_ptr = nodes_map.find(upsampling_id)->second;
-
- auto upsampling_node_ptr = nodes_map.find(rename_id)->second;
- replace_all_usages(*upsampling_node_ptr, *deconv_node_ptr);
- optimized_out.push_back(rename_id);
- nodes_map.erase(rename_id);
-
- //add connections input->deconvolution and weights->deconvolution
- add_connection(input_node, *deconv_node_ptr);
-
- for (uint32_t weights_idx = 0; weights_idx < num_filter; weights_idx++)
- {
- auto weights_node_ptr = nodes_map.find(weights_vec[weights_idx])->second;
- add_connection(*weights_node_ptr, *deconv_node_ptr);
- }
- continue;
- }
-
- //find deconvolution primitives with stride 1 and change them to convolution with trasposed weights
- if (node->is_type<deconvolution>())
- {
- if (!options.get<build_option_type::optimize_data>()->enabled())
- continue;
-
- auto deconv_prim = node->as<deconvolution>().typed_desc();
-
- //limit optimization to stride = 1
- if (deconv_prim->stride.spatial[0] != 1 || deconv_prim->stride.spatial[1] != 1 || deconv_prim->gradient())
- continue;
-
- primitive_id deconv_id = node->id();
- auto& input_node = node->get_dependency(0);
-
- primitive_id input_id = deconv_prim->input[0];
-
- //setting convolution parameters based on deconvolution params
- auto stride = deconv_prim->stride;
- auto weights = deconv_prim->weights;
- std::vector<primitive_id> weights_vec;
- for (auto& weights_id : weights)
- weights_vec.push_back(weights_id);
- auto biases = deconv_prim->bias;
- std::vector<primitive_id> bias_vec;
- for (auto& bias_id : biases)
- bias_vec.push_back(bias_id);
- auto input_offset = deconv_prim->input_offset;
- auto with_activation = deconv_prim->with_activation;
- auto activation_negative_slope = deconv_prim->activation_negative_slope;
- auto output_padding = deconv_prim->output_padding;
-
- //remove deconvolution node and its connections to weights and biases, rename it and move to the optimized list
- tensor filter_size = { 1, 1, 1, 1 };
- remove_connection(node->get_dependency(0), *node);
- for (auto& weights_id : weights_vec)
- {
- auto weights_node_ptr = nodes_map.find(weights_id)->second;
- remove_connection(*weights_node_ptr, *node);
- //get filter spatial sizes for input offset adjustment, perform this only once as all filters shouls have same size
- if (weights_id == weights_vec[0])
- filter_size = weights_node_ptr->get_output_layout().size;
- }
-
- input_offset.spatial[0] = std::abs(input_offset.spatial[0]) - (filter_size.spatial[0] - 1);
- input_offset.spatial[1] = std::abs(input_offset.spatial[1]) - (filter_size.spatial[1] - 1);
-
- if (!bias_vec.empty())
- {
- for (auto& bias_id : bias_vec)
- {
- auto bias_id_node_ptr = nodes_map.find(bias_id)->second;
- remove_connection(*bias_id_node_ptr, *node);
- }
- }
- auto rename_id = deconv_id + "_tmp";
- rename(*node, rename_id);
-
- //create convolution primitive
- if (biases.size() != 0)
- {
- auto conv_prim = std::make_shared<convolution>(deconv_id, input_id, weights_vec, bias_vec,
- stride, input_offset, tensor{ 1, 1, 1, 1 }, with_activation, activation_negative_slope, output_padding);
- get_or_create(conv_prim);
- }
- else
- {
- auto conv_prim = std::make_shared<convolution>(deconv_id, input_id, weights_vec,
- stride, input_offset, tensor{ 1, 1, 1, 1 }, with_activation, activation_negative_slope, output_padding);
- get_or_create(conv_prim);
- }
-
- auto conv_node_ptr = nodes_map.find(deconv_id)->second;
- auto conv_node = &conv_node_ptr->as<convolution>();
- conv_node->set_transposed(true);
-
- //add connections input->convolution, weights->convolution and bias->convolution
- add_connection(input_node, *conv_node_ptr);
-
- for (auto& weights_id : weights_vec)
- {
- auto weights_node_ptr = nodes_map.find(weights_id)->second;
- add_connection(*weights_node_ptr, *conv_node_ptr);
- }
-
- if (!bias_vec.empty())
- {
- for (auto& bias_id : bias_vec)
- {
- auto bias_id_node_ptr = nodes_map.find(bias_id)->second;
- add_connection(*bias_id_node_ptr, *conv_node_ptr);
- }
- }
-
- auto deconv_node_ptr = nodes_map.find(rename_id)->second;
- replace_all_usages(*deconv_node_ptr, *conv_node_ptr);
- optimized_out.push_back(rename_id);
- nodes_map.erase(rename_id);
-
- continue;
- }
- }
-}
-
-void program_impl::handle_lstm()
-{
- bool has_lstm_children;
- auto itr = nodes_map.begin(); //note we need to use iterators since currently processed element can be removed
- while (itr != nodes_map.end())
- {
- auto node_itr = itr++;
- auto& node = (*node_itr).second;
- has_lstm_children = false;
- // replace lstm node with lstm_gemm and lstm_elt nodes
- if (node->is_type<lstm>()) {
- bool initial_hidden_term = node->as<lstm>().initial_hidden_term();
- bool initial_cell_term = node->as<lstm>().initial_cell_term();
- bool bias_term = node->as<lstm>().bias_term();
- auto lstm_prim = node->as<lstm>().typed_desc();
- primitive_id weights_id = lstm_prim->weights;
- primitive_id recurrent_id = lstm_prim->recurrent;
- primitive_id bias_id = bias_term ? lstm_prim->bias : "";
- primitive_id initial_hidden_id = initial_hidden_term ? lstm_prim->initial_hidden : "";
- primitive_id initial_cell_id = initial_cell_term ? lstm_prim->initial_cell : "";
- //removing connection with weights to get proper dependency order for next operations
- remove_connection(*nodes_map.at(weights_id), *node);
- remove_connection(*nodes_map.at(recurrent_id), *node);
- if (bias_term)
- remove_connection(*nodes_map.at(bias_id), *node);
- if (initial_hidden_term)
- remove_connection(*nodes_map.at(initial_hidden_id), *node);
- if (initial_cell_term)
- remove_connection(*nodes_map.at(initial_cell_id), *node);
-
- //calculating sizes
- auto input_size = node->get_dependency(0).get_output_layout().size;
- auto recurrent_size = nodes_map.at(recurrent_id)->get_output_layout().size;
- auto hidden_size = tensor(input_size.batch[0], 1, recurrent_size.spatial[0], input_size.feature[0]);
- size_t directions = recurrent_size.feature[0];
- size_t input_dependencies = node->get_dependencies().size();
- size_t sequence_len = node->as<lstm>().sequence_len();
-
- //if the sequence has a single element but it has multiple inputs then
- //the parent of this lstm is an lstm node. If this is a bidirectional lstm
- //then the sequence length is the number of dependencies divided by 2.
- if (sequence_len == 1 && input_dependencies > 1)
- sequence_len = (directions == 1) ? input_dependencies : input_dependencies / 2;
-
- //check if this lstm node has an lstm child
- for (auto& user : node->get_users())
- {
- if (user->is_type<lstm>())
- {
- has_lstm_children = true;
- }
- }
-
- std::vector<program_node*> cell_list(directions * sequence_len);
- std::vector<program_node*> concat_depends(directions * sequence_len);
- std::vector<primitive_id> output_ids_offsets(directions * sequence_len);
-
- primitive_id hidden_fwd_id = initial_hidden_id;
- primitive_id hidden_bwd_id = initial_hidden_id;
- primitive_id cell_fwd_id = initial_cell_id;
- primitive_id cell_bwd_id = initial_cell_id;
-
- auto split_direction = [&](const std::string gate, bool initial_term, primitive_id& fwd_id, primitive_id& bwd_id) {
- if (initial_term) {
- primitive_id initial_id = fwd_id;
- fwd_id = node->id() + ":" + gate + "_fwd";
- auto fwd_node = std::make_shared<crop>(fwd_id, initial_id, hidden_size, tensor{ 0,0,0,0 });
- auto &n1 = get_or_create(fwd_node);
- add_connection(*nodes_map.at(initial_id), n1);
- bwd_id = node->id() + ":" + gate + "_bwd";
- auto bwd_node = std::make_shared<crop>(bwd_id, initial_id, hidden_size, tensor{ 0,1,0,0 });
- auto &n2 = get_or_create(bwd_node);
- add_connection(*nodes_map.at(initial_id), n2);
- }
- };
-
- //if bidirectional lstm then initial_hidden and initial_cell terms need to be split
- if (directions > 1) {
- split_direction("hidden", initial_hidden_term, hidden_fwd_id, hidden_bwd_id);
- split_direction("cell", initial_cell_term, cell_fwd_id, cell_bwd_id);
- }
-
- //lstm expanding
- for (size_t dir = 0; dir < directions; ++dir) {
- auto hidden_id = dir == 0 ? hidden_fwd_id : hidden_bwd_id;
- auto cell_id = dir == 0 ? cell_fwd_id : cell_bwd_id;
- for (size_t i = 0; i < sequence_len; ++i) {
- size_t idx = i + dir * sequence_len;
- primitive_id lstm_gemm_id = node->id() + ":lstm_gemm" + get_id_string(idx);
- primitive_id lstm_elt_id = node->id() + ":lstm_elt" + get_id_string(idx);
- primitive_id crop_id = node->id() + ":crop" + get_id_string(idx);
-
- size_t input_idx = i;
- //for bidirectional lstms, if first LSTM layer then reverse input
- //for subsequent stacked layers the input is strided on the dir dimension
- if (directions > 0) {
- if (input_dependencies > sequence_len) { // stacked layer
- input_idx = dir * sequence_len + i;
- }
- else
- {
- if (dir > 0) { // first layer
- input_idx = sequence_len - i - 1;
- }
- }
- }
- primitive_id lstm_gemm_input_id = node->get_dependency(input_idx).get_org_primitive_id();
-
- auto lstm_gemm_node = std::make_shared<lstm_gemm>(lstm_gemm_id, lstm_gemm_input_id, weights_id, recurrent_id, bias_id, hidden_id, (uint32_t)dir);
- auto &n1 = get_or_create(lstm_gemm_node);
-
- auto lstm_elt_node = std::make_shared<lstm_elt>(lstm_elt_id, lstm_gemm_id, cell_id, lstm_prim->clip, lstm_prim->input_forget,
- lstm_prim->activations, lstm_prim->activation_params, lstm_prim->offset_order);
- auto &n2 = get_or_create(lstm_elt_node);
- //adding lstm_elt as user
- add_connection(n1, n2);
- //adding dependecy to lstm_gemm node
- //input
- add_connection(node->get_dependency(input_idx), n1);
- //adding weights and initial values to lstm_gemm
- add_connection(*nodes_map.at(weights_id), n1);
- add_connection(*nodes_map.at(recurrent_id), n1);
- if (bias_term)
- add_connection(*nodes_map.at(bias_id), n1);
-
- //adding cell and hiddens as dependencies
- if (i > 0)
- {
- add_connection(*cell_list[size_t(i - 1) * directions + dir], n2);
- add_connection(*(concat_depends[size_t(i - 1) * directions + dir]), n1);
- }
- //if initial values are present
- else
- {
- if (initial_hidden_term)
- add_connection(*nodes_map.at(hidden_id), n1);
- if (initial_cell_term)
- add_connection(*nodes_map.at(cell_id), n2);
- }
-
- //lstm_hidden
- hidden_id = crop_id + ":hidden";
- auto crop_hidden = std::make_shared<crop>(hidden_id, lstm_elt_id, hidden_size, tensor{ 0,0,0,0 });
- auto &n3 = get_or_create(crop_hidden);
- //adding eltwise as dependency to hidden
- add_connection(n2, n3);
-
- //if parent is lstm adding hiddens as dependency
- if (has_lstm_children)
- {
- for (auto& user : node->get_users())
- {
- add_connection(n3, *user);
- }
- }
- concat_depends[i * directions + dir] = &n3;
-
- //lstm_cell
- if (i < sequence_len - 1) {
- cell_id = crop_id + ":cell";
- auto crop_cell = std::make_shared<crop>(cell_id, lstm_elt_id, hidden_size, tensor{ 0,1,0,0 });
- auto &n4 = get_or_create(crop_cell);
- add_connection(n2, n4);
- cell_list[i * directions + dir] = &n4;
- }
- output_ids_offsets[i * directions + dir] = hidden_id;
- }
- }
-
- //if there is no next lstm, concatenation is created
- if (!has_lstm_children)
- {
- primitive_id original_id = node->id();
- primitive_id concatenation_id = original_id + ":concat";
- auto concatenation_primitive = std::make_shared<concatenation>(concatenation_id, output_ids_offsets, concatenation::along_f);
- auto &concatenation_node = get_or_create(concatenation_primitive);
- for (auto sub_dependency : concat_depends)
- {
- add_connection(*sub_dependency, concatenation_node);
- }
- if (directions == 2) {
- // bidirectional support requires concatenations along the direction and sequence axis
- // instead we can concatenate along the sequence axis and reshape the tensor to the account
- // for the direction
- tensor output_size {input_size.batch[0], (int32_t)sequence_len, hidden_size.spatial[0], (int32_t)directions};
- primitive_id reshape_id = original_id + ":reshape";
- auto reshape_primitive = std::make_shared<reshape>(reshape_id, concatenation_id, output_size);
- auto &reshape_node = get_or_create(reshape_primitive);
- add_connection(concatenation_node, reshape_node);
- for (auto& user : node->get_users())
- {
- add_connection(reshape_node, *user);
- }
- }
- }
-
- //removing expanded node
- remove_all_connections(*node);
- nodes_map.erase(node->id());
- continue;
- }
- }
-
-}
-
-void program_impl::set_outputs()
-{
- auto outputs_option = options.get<build_option_type::outputs>();
- if (!outputs_option->outputs.empty())
- {
- for (auto const& output : outputs_option->outputs)
- {
- auto o_node = nodes_map.at(output);
- o_node->set_output(true);
- outputs.push_back(o_node.get());
- }
- }
- else
- {
- for (auto& node : nodes_map)
- if (node.second->is_endpoint())
- {
- node.second->set_output(true);
- outputs.push_back(node.second.get());
- }
- }
-}
-
-void program_impl::calc_processing_order()
-{
- processing_order.clear();
-
- //run dfs to sort nodes topologically
- for (auto input : inputs)
- {
- if (input->is_marked())
- continue;
-
- input->mark();
- std::list<std::pair<program_node*, std::list<program_node*>::const_iterator>> stack = { std::make_pair(input, input->users.begin()) };
-
- while (!stack.empty()) //imitate call stack
+ // find the last user of B in processing order
+ auto itrUsr = nodeB->get_users().begin();
+ auto lastUsr = itrUsr++;
+ while (itrUsr != nodeB->get_users().end())
{
- new_frame:
- auto& frame = stack.back();
-
- while (frame.second != frame.first->users.end())
- {
- auto successor = *frame.second;
- ++frame.second;
-
- if (!successor->is_marked())
- {
- successor->mark();
-
- //recurrence call
- stack.push_back(std::make_pair(successor, successor->users.begin()));
- goto new_frame;
- }
- }
-
- //we have finished processing one node so add it to the processing queue
- processing_order.push_front(frame.first);
- frame.first->processing_itr = processing_order.begin();
-
- //return from call
- stack.pop_back();
+ if (processing_order.get_processing_number(*lastUsr) < processing_order.get_processing_number(*itrUsr))
+ lastUsr = itrUsr;
+ itrUsr++;
}
- }
-
- uint32_t idx = 0;
- for (auto& node : processing_order)
- {
- node->processing_num = ++idx;
- node->unmark();
- }
-}
-
-void program_impl::update_processing_numbers()
-{
- uint32_t idx = 0;
- for (auto& node : processing_order)
- {
- node->processing_num = ++idx;
- }
- for (auto& node : processing_order)
- {
- if (!processing_order_is_correct(node))
+ //mark all nodes in between B and lastUsr of B as forbidden to share buffer with B
+ while (itrA != processing_order.get_processing_iterator(**lastUsr))
{
- CLDNN_ERROR_MESSAGE(node->id(), "Incorrect processing order");
- return;
+ auto& nodeA = *itrA;
+ itrA++;
+ add_memory_dependency(nodeA, nodeB);
+ add_memory_dependency(nodeB, nodeA);
}
}
}
-void program_impl::calc_prior_boxes()
+void program_impl::oooq_memory_dependencies()
{
auto itr = processing_order.begin();
- while (itr != processing_order.end())
- {
- auto& node = (*itr++);
- if (!node->is_type<prior_box>())
- continue;
-
- auto& pb_node = node->as<prior_box>();
-
- pb_node.calc_result();
- remove_connection(pb_node.input(), pb_node);
-
- auto& result = pb_node.get_result_buffer();
- result.add_ref(); // need to inc ref count since we will be assigning this memory as cldnn_memory in next line that is not ref_count_obj
- auto cpp_mem = details::memory_c_to_cpp_converter::convert(api_cast(&result));
-
- auto& data_node = get_or_create(std::make_shared<data>("_cldnn_tmp_" + pb_node.id() + "_result", cpp_mem));
- replace(pb_node, data_node, false, false);
- }
-}
-
-
+ // This order let us build dependencies based on syncing points.
+ // Set of nodes between two syncing points will be called sync_region.
+ // Major rules is: can't share resource with nodes in my sync_region
-void program_impl::mark_constants()
-{
- for (auto& node : processing_order)
+ int32_t last_barrier = 0;
+ bool needs_barrier = false;
+ std::vector<cldnn::program_node*> sync_region;
+ while (itr != processing_order.end())
{
- if (node->dependencies.empty())
- continue;
- if (node->is_type<prior_box>())
- continue;
+ auto& node = *itr;
+ itr++;
- node->constant = true;
- for (auto& dep : node->get_dependencies())
+ // if any of dep has proccess num after barrier -> needs barrier
+ for (auto dep : node->get_dependencies())
{
- if (!dep->constant)
+ if (processing_order.get_processing_number(dep) >= last_barrier)
{
- node->constant = false;
+ needs_barrier = true;
break;
}
}
- if (!node->constant)
- for (auto& dep : node->get_dependencies())
- if (dep->constant)
- dep->constant_frontier = true;
- }
-}
-
-void program_impl::mark_data_flow()
-{
- std::list<program_node*> stack;
- for (auto const& node : processing_order)
- {
- if ((node->is_endpoint() && !node->constant) || node->is_type<mutable_data>())
+ if (needs_barrier)
{
- stack.push_back(node);
- node->data_flow = true;
- node->mark();
- }
- }
-
- while (!stack.empty())
- {
- auto node = stack.front();
- stack.pop_front();
-
- size_t dep_idx = 0;
- size_t inputs_count = (node->is_type<internal_primitive>() ? node->get_dependencies().size() : node->get_primitive()->input.size());
- //TODO: remove this hack after addition of constants propagation pass
- if (node->is_type<detection_output>() || node->is_type<proposal>())
- inputs_count = 2; //ignore third input as it is related to prior boxes (i.e. concat of prior-boxes)
+ last_barrier = processing_order.get_processing_number(node);
+ needs_barrier = false;
+ // add each pair bi-direction dependency
+ for (auto nd1 = sync_region.begin(); nd1 + 1 != sync_region.end(); nd1++)
+ {
+ for (auto nd2 = nd1 + 1; nd2 != sync_region.end(); nd2++)
+ {
+ add_memory_dependency(*nd1, *nd2);
+ add_memory_dependency(*nd2, *nd1);
+ }
+ }
- for (auto dep : node->get_dependencies())
- {
- bool data_flow = (dep_idx < inputs_count && !dep->constant);
- ++dep_idx;
- if (!data_flow)
- continue;
+ // collect dependencies of every node in sync region
+ std::vector<cldnn::program_node*> deps;
+ for (auto& nd_in_region : sync_region)
+ for (auto& dep : nd_in_region->get_dependencies())
+ deps.emplace_back(dep);
- dep->data_flow = data_flow;
- if (dep->is_marked())
- continue;
+ for (auto& nd_in_region : sync_region)
+ for (auto& dep : deps)
+ {
+ add_memory_dependency(nd_in_region, dep);
+ add_memory_dependency(dep, nd_in_region);
+ }
- stack.push_back(dep);
- dep->mark();
+ sync_region.clear();
}
- }
-
- for (auto& node : processing_order)
- {
- assert(!node->constant || !node->data_flow); //node which is constant cannot be marked as data flow
- node->unmark();
+ sync_region.push_back(node);
}
}
-void program_impl::trim_to_outputs()
+void program_impl::prepare_memory_dependencies()
{
- size_t actual_nodes = processing_order.size();
- if (!actual_nodes) //degenerated case but can happen
- return;
-
- if (outputs.size() == actual_nodes)
- return;
-
- //do backward bfs starting from all outputs
- std::list<const std::vector<program_node*>*> stack = { &outputs };
- while (!stack.empty())
- {
- auto nodes_list = stack.front();
- stack.pop_front();
-
- for (auto node : *nodes_list)
- {
- if (!node->is_marked())
- {
- node->mark();
- if (!node->get_dependencies().empty())
- stack.push_back(&node->get_dependencies());
- }
- }
- }
-
- //all not-marked nodes should be removed
- std::list<program_node*> to_rem;
- for (auto node : processing_order)
- {
- if (node->is_type<input_layout>()) //input layout may become disconnected during prior boxes calculations so it may have not been marked at this place but we don't want to remove it
- node->mark();
- else if (!node->is_marked())
- to_rem.push_back(node);
- }
-
- for (auto const& node : to_rem)
- {
- if (node->is_input())
- inputs.remove(node);
- else
- {
- for (auto dep : node->dependencies)
- if (dep->is_marked())
- dep->users.remove(node);
- }
-
- for (auto user : node->users)
- if (user->is_marked())
- user->dependencies.erase(std::remove(user->dependencies.begin(), user->dependencies.end(), node), user->dependencies.end());
-
- optimized_out.push_back(node->id());
- nodes_map.erase(node->id());
- }
-}
-
-void add_memory_dependency(program_node* node, program_node* dep)
-{
- if (node->can_be_optimized() ||
- !dep->can_be_optimized())
- {
- node->add_memory_dependency(dep->id());
- }
- else
- {
- if (node->id() == dep->id())
- {
- return;
- }
- for (auto subdep : dep->get_dependencies())
- {
- add_memory_dependency(node, subdep);
- add_memory_dependency(subdep, node);
- }
- }
-}
-
-void program_impl::basic_memory_dependencies()
-{
- auto itr = processing_order.begin();
- std::vector<primitive_id> past_outputs;
- while (itr != processing_order.end())
- {
- auto& node = *itr;
- itr++;
-
- //data primitive can't be reused
- if (node->is_type<data>())
- continue;
-
- // add my dependencies to restriction list (can't share input.output buffers)
- for (auto it : node->get_dependencies())
- {
- add_memory_dependency(node, it);
- add_memory_dependency(it, node);
- }
-
- // Note we iterate over processing order, it means if primitve has processing num greater than any of outputs, this output
- // has to land on the primitve restriction list. Otherwise memory reuse can corrupt final results.
- node->add_memory_dependency(past_outputs);
- // if current node is an output add it to the outputs list after restriction.
- if (node->is_output())
- past_outputs.push_back(node->id());
- }
-}
-
-void program_impl::skipped_branch_memory_dependencies()
-{
- auto itr = processing_order.begin();
- // Primitive A can't use primitive B buffer if B->processing_num < A->processing_num and any of B users processing_num > A->processing_num
- // Otherwise it could override data that has to be used in the future.
- // TODO: improve algorithm to to O(n*log(n))
- while (itr != processing_order.end())
- {
- auto& node = *itr;
- itr++;
- auto itr2 = processing_order.begin();
- if (itr2 == itr)
- continue;
- while (itr2 != processing_order.end())
- {
- auto& node2 = *itr2;
- itr2++;
- if (node2->get_processing_num() < node->get_processing_num())
- {
- // if at least one user will be processed after 'node', node2 has to be added to forbiden list
- for (auto usr : node2->get_users())
- {
- if (usr->get_processing_num() > node->get_processing_num())
- {
- add_memory_dependency(node, node2);
- add_memory_dependency(node2, node);
- break;
- }
- }
- }
- }
- }
-}
-
-void program_impl::oooq_memory_dependencies()
-{
- auto itr = processing_order.begin();
- // This order let us build dependencies based on syncing points.
- // Set of nodes between two syncing points will be called sync_region.
- // Major rules is: can't share resource with nodes in my sync_region
-
- uint32_t last_barrier = 0;
- bool needs_barrier = false;
- std::vector<cldnn::program_node*> sync_region;
- while (itr != processing_order.end())
- {
- auto& node = *itr;
- itr++;
-
- // if any of dep has proccess num after barrier -> needs barrier
- for (auto dep : node->get_dependencies())
- {
- if (dep->get_processing_num() >= last_barrier)
- {
- needs_barrier = true;
- break;
- }
- }
-
- if (needs_barrier)
- {
- last_barrier = node->get_processing_num();
- needs_barrier = false;
- // add each pair bi-direction dependency
- for (auto nd1 = sync_region.begin(); nd1 + 1 != sync_region.end(); nd1++)
- {
- for (auto nd2 = nd1 + 1; nd2 != sync_region.end(); nd2++)
- {
- add_memory_dependency(*nd1, *nd2);
- add_memory_dependency(*nd2, *nd1);
- }
- }
-
- // collect dependencies of every node in sync region
- std::vector<cldnn::program_node*> deps;
- for (auto& nd_in_region : sync_region)
- for (auto& dep : nd_in_region->get_dependencies())
- deps.emplace_back(dep);
-
-
- for (auto& nd_in_region : sync_region)
- for (auto& dep : deps)
- {
- add_memory_dependency(nd_in_region, dep);
- add_memory_dependency(dep, nd_in_region);
- }
-
- sync_region.clear();
- }
- sync_region.push_back(node);
- }
-}
-
-void program_impl::prepare_memory_dependencies()
-{
- if (!get_engine().configuration().enable_memory_pool)
- return;
-
- basic_memory_dependencies();
- skipped_branch_memory_dependencies();
- oooq_memory_dependencies();
-}
-
-std::string program_impl::get_memory_dependencies_string() const
-{
- std::string mem_dep = "Memory dependencies/restrictions:\n";
- auto itr = processing_order.begin();
- while (itr != processing_order.end())
- {
- auto& node = *itr;
- itr++;
- mem_dep = mem_dep.append("primitive: ").append(node->id()).append(" restricted list: ");
- for (auto it : node->get_memory_dependencies())
- mem_dep == mem_dep.append(it).append(", ");
- mem_dep = mem_dep.append("\n");
- }
- return mem_dep;
-}
-
-void program_impl::remove_redundant_reorders()
-{
- auto itr = processing_order.begin(); //note we need to use iterators since currently processed element can be removed
- while (itr != processing_order.end())
- {
- auto& node = (*itr++); //post-inc to avoid invalidation due to possible erase
- if (!node->is_type<reorder>()) //only care for reorders
- continue;
-
- program_node* current_node = node;
- std::vector<program_node*> r_nodes_to_remove;
-
- auto optimize = true;
- while (current_node)
- {
- auto& r_node = current_node->as<reorder>();
- current_node = nullptr;
-
- if (r_node.has_mean() || !r_node.get_primitive()->subtract_per_feature.empty() || //do not optimize if mean of subtract are present
- (r_node.is_output() && r_node.get_dependency(0).is_output())) //do not optimize when both reorder and layer before are outputs
- {
- optimize = false;
- break;
- }
-
- r_nodes_to_remove.push_back(&r_node);
-
- if (r_node.get_dependency(0).is_type<reorder>() && r_node.get_dependencies().size() == 1 && r_node.get_users().size() == 1 && r_node.get_dependency(0).get_users().size() == 1)
- current_node = &r_node.get_dependency(0);
- }
- if (!optimize)
- continue;
-
- assert(node->dependencies.size() == 1 && "reorder without mean should have exactly one dependecy (input)");
- auto& r_output = r_nodes_to_remove.front();
- auto& r_input = r_nodes_to_remove.back()->get_dependency(0);
- auto o_layout = r_output->get_output_layout();
- auto i_layout = r_input.get_output_layout();
-
- auto ident = are_layouts_identical(o_layout, i_layout);
- if (!ident.second)
- continue;
-
- for (auto remove_reorder_node : r_nodes_to_remove)
- {
- auto& r_node = remove_reorder_node->as<reorder>();
-
- if (ident.first && ident.second && r_node.is_output() && r_node.get_dependency(0).is_input()) //do not optimize when reorder is output and layer before is input
- {
- optimize = false;
- break;
- }
- }
- if (!optimize)
- continue;
-
- for (auto remove_reorder_node : r_nodes_to_remove)
- {
- auto& r_node = remove_reorder_node->as<reorder>();
-
- //mark as optimized
- r_node.can_be_optimized(true);
- r_node.requires_reinterpret(!ident.first);
- if (ident.first) //no need of reshape
- extract_and_remove(r_node); //try to remove if possible (with respect to r_node not being marked as output)
- }
- }
-}
-
-/*
- recalculate processing_order
- algorithm based on: CLRS 24.5 (critical path in DAG)
- modifications: adjust for multiple inputs
- input: any topological order in processing order
- output: BFS topological order.
-*/
-
-void program_impl::calculate_BFS_processing_order() {
- std::map<program_node*, int> distances;
- for (auto itr : processing_order)
- {
- distances[itr] = -1;
- }
- int max_distance = 0;
- for (auto itr : processing_order)
- {
- //Init
- if (distances[itr] == -1) { // this must be an input
- distances[itr] = 0; // initialize input
- }
- // RELAX
- for (auto& user : itr->get_users())
- {
- distances[user] = std::max(distances[user], distances[itr] + 1);
- max_distance = std::max(max_distance, distances[user]);
- }
- }
-
- //bucket sort nodes based on their max distance from input
- std::vector<std::vector<program_node*>> dist_lists;
- dist_lists.resize(max_distance + 1);
- for (auto itr : processing_order)
- {
- dist_lists[distances[itr]].push_back(itr);
- }
-
- //replace the old processing order by the new one, still topological.
- processing_order.clear();
- for (auto& dist : dist_lists)
- {
- for (auto& node : dist)
- {
- processing_order.push_back(node);
- node->processing_itr = processing_order.end();
- node->processing_itr--;
- }
- }
- update_processing_numbers();
- return;
-}
-
-void program_impl::reorder_inputs(layout_optimizer& lo)
-{
- //first pass to set layout optimization_attributes for topology
- for (auto& p : nodes_map)
- {
- auto& prim = *p.second;
- if (prim.type() == cldnn::convolution::type_id())
- {
- if (prim.as<convolution>().get_primitive()->split() > 1)
- lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::splitted_convolution, 1);
- }
-
- //list of layers that do not support yxfb or perform worse than bfyx
- if (prim.type() == cldnn::detection_output::type_id() || prim.type() == cldnn::proposal::type_id() ||
- prim.type() == cldnn::roi_pooling::type_id() || prim.type() == cldnn::deconvolution::type_id() ||
- prim.type() == cldnn::upsampling::type_id() || prim.type() == cldnn::reorg_yolo::type_id())
- lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::bfyx_only_layer, 1);
- }
-
- const auto reorder_input = [this, &lo](typed_program_node<convolution>& conv_node)
- {
- auto conv_prim = conv_node.get_primitive();
- auto& input_node = conv_node.get_dependency(0);
- auto&& weights_layout = conv_node.weights(0).get_output_layout();
- auto&& input_layout = input_node.get_output_layout();
-
- std::shared_ptr<reorder> new_input = nullptr;
-
- if (input_node.type() == reorder::type_id()) //convolution's input is a reorder
- {
- auto reorder_prim = input_node.as<reorder>().typed_desc();
- auto& reorder_input = input_node.get_dependency(0);
- auto reorder_layout = input_node.get_output_layout();
- reorder_layout.data_type = reorder_prim->output_data_type;
- new_input = lo.get_reorder(
- reorder_layout,
- reorder_prim->id,
- layout_optimizer::data_type::input,
- conv_node,
- weights_layout).first;
-
- auto reorder_removed = false;
- if (new_input && new_input->output_format != format::winograd_2x3_s1_data && new_input->output_format != format::bf8_xy16 && new_input->output_format != format::byxf) //output format is not optimal
- {
- auto reorder_input_layout = reorder_input.get_output_layout();
-
- auto opt_layout = layout(new_input->output_data_type, new_input->output_format, reorder_input_layout.size);
- if (reorder_input_layout == opt_layout) //reorder 'breaks' optimal format
- {
- if (reorder_prim->subtract_per_feature.empty() &&
- reorder_prim->mean.empty() &&
- !reorder_prim->output_padding) //just plain reorder
- {
- conv_node.replace_dependency(0, reorder_input);
- if (input_node.get_users().size() == 0 && !input_node.is_output())
- {
- reorder_removed = extract_and_remove(input_node);
- }
- new_input = nullptr;
- }
- else //change reorder's output layout
- {
- reorder_prim->output_format = opt_layout.format;
- reorder_prim->output_data_type = opt_layout.data_type;
- new_input = nullptr;
- }
- }
- else //current reorder gives bad output, simply change it
- {
- reorder_prim->output_format = opt_layout.format;
- reorder_prim->output_data_type = opt_layout.data_type;
- new_input = nullptr;
- }
- }
-
- if (!reorder_removed)
- input_node.recalc_output_layout();
- else
- conv_node.recalc_output_layout();
- }
- else
- {
- new_input = lo.get_reorder(
- input_node.get_output_layout(),
- input_node.id(),
- layout_optimizer::data_type::input,
- conv_node,
- weights_layout).first;
- }
-
- if (new_input && new_input->output_format == format::winograd_2x3_s1_data)
- {
- auto lower_size = (conv_prim->input_offset.negate() + input_layout.size);
-
- tensor upper_input_padding = tensor{ 0 };
- upper_input_padding.spatial[0] = (2 - (lower_size.spatial[0] % 2)) % 2; //winograd conv requires input's x to be in form 4 + 2n, with restriction that x >= 3, we can shortage it to x % 2 == 0
- upper_input_padding.spatial[1] = (8 - ((lower_size.spatial[1] - 2) % 8)) % 8; //for y, y - 2 % 8 == 0 must hold
-
- apply_needed_padding(conv_node, input_node, padding{ conv_prim->input_offset.negate().sizes(), upper_input_padding.sizes() });
-
- auto winograd_output = std::make_shared<reorder>("_winograd_" + conv_node.id(), conv_node.id(), input_layout.format,
- input_layout.data_type, std::vector<float>{}, cldnn_reorder_mean_mode::mean_subtract, conv_node.output_layout.data_padding);
- conv_node.output_layout.data_padding = padding{};
- auto& back_node = get_or_create(winograd_output);
- back_node.processing_itr = processing_order.insert(std::next(conv_node.processing_itr), &back_node);
-
- auto bias_term = conv_node.bias_term();
- //create additional eltwise node after reorder to compute bias
- if (bias_term)
- {
- auto& bias_node = conv_node.get_dependency(2);
- std::vector<primitive_id> inputs = { back_node.id(), bias_node.id() };
- auto winograd_output_biases = std::make_shared<eltwise>(back_node.id() + "_bias", inputs,
- cldnn::eltwise_mode::sum, conv_prim->with_activation, conv_prim->activation_negative_slope,
- back_node.output_layout.data_padding);
- back_node.output_layout.data_padding = padding{};
- auto& back_bias_node = get_or_create(winograd_output_biases);
- back_bias_node.processing_itr = processing_order.insert(std::next(back_node.processing_itr), &back_bias_node);
- replace_all_usages(back_node, back_bias_node);
- add_connection(back_node, back_bias_node);
- add_connection(bias_node, back_bias_node);
- conv_node.invalidate_users();
- replace_all_usages(conv_node, back_bias_node);
- }
-
- if (conv_prim->with_activation)
- {
- conv_node.typed_desc()->with_activation = false;
- if (!bias_term)
- back_node.set_fused_activation(activation_relu_negative_slope, cldnn_activation_additional_params_t{ conv_prim->activation_negative_slope, 0.0f });
- }
-
- if (!bias_term)
- {
- conv_node.invalidate_users();
- replace_all_usages(conv_node, back_node);
- }
- add_connection(conv_node, back_node);
-
- auto& r_node = get_or_create(new_input);
- r_node.as<reorder>().set_input_offset(conv_prim->input_offset);
-
- if (!bias_term)
- {
- swap_names(conv_node, back_node);
- if (conv_node.is_output())
- {
- conv_node.set_output(false);
- back_node.set_output(true);
- for (auto& output : outputs)
- {
- if (output == &conv_node)
- {
- output = &back_node;
- break;
- }
- }
- }
- }
- else
- {
- conv_node.remove_dependency(2);
- auto& back_bias_node = *nodes_map.find(back_node.id() + "_bias")->second;
- swap_names(conv_node, back_bias_node);
- if (conv_node.is_output())
- {
- conv_node.set_output(false);
- back_bias_node.set_output(true);
- for (auto& output : outputs)
- {
- if (output == &conv_node)
- {
- output = &back_bias_node;
- break;
- }
- }
- }
- }
- }
-
- if (new_input && (new_input->output_format == format::bf8_xy16 || new_input->output_format == format::byxf))
- {
- auto conv1x1_output = std::make_shared<reorder>("_conv1x1_reorder_back_" + conv_node.id(), conv_node.id(), input_layout.format, input_layout.data_type);
- auto& back_node = get_or_create(conv1x1_output);
- back_node.processing_itr = processing_order.insert(std::next(conv_node.processing_itr), &back_node);
-
- conv_node.invalidate_users();
- replace_all_usages(conv_node, back_node);
- add_connection(conv_node, back_node);
- }
-
- if (new_input)
- {
- auto& r_node = get_or_create(new_input);
- add_intermediate(r_node, conv_node, 0, r_node.dependencies.empty());
- conv_node.recalc_output_layout();
- }
- };
-
- const auto reorder_input_detection_output = [this, &lo](typed_program_node<detection_output>& detection_output_node)
- {
- auto detection_output_prim = detection_output_node.get_primitive();
-
- for (size_t i = 0; i < detection_output_node.get_dependencies().size(); i++)
- {
- auto& input = detection_output_node.get_dependency(i);
- std::shared_ptr<reorder> new_input = lo.get_reorder(
- input.get_output_layout(),
- input.id(),
- layout_optimizer::data_type::input,
- detection_output_node,
- layout{ data_types::f32, format::bfyx, tensor{} }).first;
-
- if (new_input)
- {
- add_intermediate(new_input, detection_output_node, i);
- }
- }
- };
-
- for (auto& prim : processing_order)
- {
- //there's an assumption that only convolution will take data/input_layout as input
- //exception to that rule would be a convolution which takes a reorder as input - see reoder_input above
- do_for_types<convolution, detection_output>(*prim,
- reorder_input, //case for convolution
- reorder_input_detection_output //case for detection-output
- );
- }
-}
-
-//function which prepares given primitive for weights optimization
-template <typename T>
-void program_impl::optimize_bias(T& node, layout_optimizer& lo)
-{
- layout output_layout = node.get_output_layout();
-
- size_t weights_offset = node.get_primitive()->input.size();
- size_t bias_offset = weights_offset + wrap_if_single(node.get_primitive()->weights).size();
- for (size_t i = bias_offset; i < node.get_dependencies().size(); ++i)
- {
- //find weights primitive with given pimitive_id and add it to weights_optimizer
- const program_node& bias = node.get_dependency(i);
- const auto bias_type = layout_optimizer::data_type::bias;
- auto reorder = lo.get_reorder(
- bias.get_output_layout(),
- bias.id(),
- bias_type,
- node,
- output_layout);
-
- if (reorder.first)
- this->add_intermediate(reorder.first, node, i, !reorder.second);
- }
-}
-template void program_impl::optimize_bias<convolution_node>(convolution_node& node, layout_optimizer& lo);
-template void program_impl::optimize_bias<deconvolution_node>(deconvolution_node& node, layout_optimizer& lo);
-template void program_impl::optimize_bias<fully_connected_node>(fully_connected_node& node, layout_optimizer& lo);
-template void program_impl::optimize_bias<embed_node>(embed_node& node, layout_optimizer& lo);
-
-void program_impl::pre_optimize_bias(layout_optimizer& lo)
-{
- for (auto& p : nodes_map)
- {
- auto& prim = *p.second;
- if (prim.type() == convolution::type_id())
- {
- if (!prim.as<convolution>().weights_quantization_term())
- optimize_bias(prim.as<convolution>(), lo);
- }
- else if (prim.type() == deconvolution::type_id())
- {
- optimize_bias(prim.as<deconvolution>(), lo);
- }
- else if (prim.type() == fully_connected::type_id())
- {
- if (!prim.as<fully_connected>().weights_quantization_term())
- optimize_bias(prim.as<fully_connected>(), lo);
- }
- else if (prim.type() == embed::type_id())
- {
- optimize_bias(prim.as<embed>(), lo);
- }
- }
-}
-
-template <typename T>
-void program_impl::optimize_depthwise_sep_pre(T& node)
-{
- //enable optimization only when IFM / split <= 8 (otherwise scheduling multiple opt kernels is better) and split >= 16
- if (!(node.get_dependency(0).get_output_layout().size.feature[0] / node.get_primitive()->split() <= 8) ||
- !(node.get_primitive()->split() >= 16))
- return;
-
- //make sure the weights and biases are data type and
- //are not reused in other primitives as they will be overriden with concatenated ones
- for (size_t i = 1; i < node.get_dependencies().size(); i++)
- {
- auto& weights_or_biases = node.get_dependency(i);
- if (weights_or_biases.get_users().size() > 1 || weights_or_biases.type() != data::type_id())
- return;
- }
-
- node.set_depthwise_sep_opt(true);
-}
-template void program_impl::optimize_depthwise_sep_pre<convolution_node>(convolution_node& node);
-template void program_impl::optimize_depthwise_sep_pre<deconvolution_node>(deconvolution_node& node);
-
-void program_impl::prepare_depthwise_sep_opt()
-{
- //depthiwise separated convolution/deconvolution optimization
- for (auto& p : nodes_map)
- {
- auto& prim = *p.second;
- if (prim.type() == convolution::type_id())
- {
- optimize_depthwise_sep_pre(prim.as<convolution>());
- }
- else if (prim.type() == deconvolution::type_id())
- {
- optimize_depthwise_sep_pre(prim.as<deconvolution>());
- }
- }
-}
-
-void program_impl::handle_reshape()
-{
- //reshape primitive by definition does not change underlying data, only shape description
- //however during graph initialization and data optimization the layouts can be changed without user's knowledge,
- //when reshape is followed by reorder, it is likely that reorder's output will not be as expected (for example reshape with flattened shape)
- //this pass resolved the issue by changing graph in the following way
- //- in case reshape has multiple users with reshape->reorder sequence, it will be splitted to multiple reshape primitives with single user
- //- in case of reshape->reorder sequence, the additional reorder before reshape will be added,
- // if last reorder does not contain padding or mean subtract, it will be removed later in the graph
-
- for (const auto& node : processing_order)
- {
- if (node->is_type<reshape>())
- {
- auto& input_node = node->get_dependency(0);
-
- if (input_node.is_type<reorder>())
- continue;
-
- //vector for storing nodes that are reorder type, for which splitted primitives are needed (except for the first one where orginal reshape will be used)
- std::vector<program_node*> reorder_node_to_split;
-
- //find the users of reshape that are reorder type, if none present then skip the current node
- for (const auto& user : node->get_users())
- {
- if (user->is_type<reorder>())
- reorder_node_to_split.push_back(user);
- }
-
- if (!reorder_node_to_split.empty())
- {
- auto& prim_node = node->as<reshape>();
- const auto& prim = prim_node.get_primitive();
- auto output_shape = prim->output_shape;
-
- //vector for storing reshape nodes to connect to new reorder nodes (if needed)
- std::vector<program_node*> reorder_reshape_nodes;
-
- bool skip_first_user = false;
- auto reshape_users = node->get_users();
- for (const auto& user : reshape_users)
- {
- //reshape node for first user will be the orginal reshape from the graph
- if (!skip_first_user)
- {
- if (std::find(reorder_node_to_split.begin(), reorder_node_to_split.end(), user) != reorder_node_to_split.end())
- reorder_reshape_nodes.push_back(node);
- skip_first_user = true;
- continue;
- }
-
- //other reshapes will be clones of the orginal one connected to reshape->reorder sequences
- if (std::find(reorder_node_to_split.begin(), reorder_node_to_split.end(), user) != reorder_node_to_split.end())
- {
- auto new_reshape = std::make_shared<reshape>("_reshape_split_" + user->id() + "_" + node->id(), input_node.id(), output_shape);
- auto& new_reshape_node = get_or_create(new_reshape);
- add_connection(input_node, new_reshape_node);
- user->replace_dependency(0, new_reshape_node);
- new_reshape_node.processing_itr = processing_order.insert(std::next(input_node.processing_itr), &new_reshape_node);
- reorder_reshape_nodes.push_back(&new_reshape_node);
- }
- }
-
- //add new reorder nodes to proper reshape node
- auto reshape_reorder_id = 0;
- for (const auto& reorder_node : reorder_node_to_split)
- {
- auto& reorder_reshape_node = reorder_reshape_nodes[reshape_reorder_id];
- auto reshape_in_layout = reorder_node->get_output_layout();
- auto reshape_input = std::make_shared<reorder>("_reshape_input_" + reorder_node->id() + "_" + reorder_reshape_node->id(), input_node.id(), reshape_in_layout.format, reshape_in_layout.data_type);
- auto& reshape_input_node = get_or_create(reshape_input);
- add_intermediate(reshape_input_node, *reorder_reshape_node, 0, reshape_input_node.dependencies.empty());
- reshape_reorder_id++;
- }
- }
-
- auto reshape_layout = node->get_output_layout();
- if (!(node->is_output()) && (reshape_layout.format != cldnn::format::bfyx))
- {
- auto bfyx_layout = layout({ reshape_layout.data_type, cldnn::format::bfyx, reshape_layout.size });
- //when some primitive does an implicit reorder to some other format then we lose the info about pitches in reshape stage
- //we assume user provides the input vector in bfyx
- if (!are_layouts_identical(reshape_layout, bfyx_layout).second)
- {
- auto reshape_input = std::make_shared<reorder>("_reshape_input_" + node->id(), input_node.id(), cldnn::format::bfyx, reshape_layout.data_type);
- auto& reshape_input_node = get_or_create(reshape_input);
- add_intermediate(reshape_input_node, *node, 0, reshape_input_node.dependencies.empty());
-
- auto reshape_users = node->get_users();
- for (const auto& user : reshape_users)
- {
- size_t idx = 0;
- for (size_t i = 0; i < user->get_dependencies().size(); i++)
- {
- auto& input = user->get_dependency(i);
- if (input.id() == node->id()) {
- idx = i;
- break;
- }
- }
- auto reshape_output = std::make_shared<reorder>("_reshape_output_" + node->id(), user->id(), reshape_layout.format, reshape_layout.data_type);
- auto& reshape_output_node = get_or_create(reshape_output);
- add_intermediate(reshape_output_node, *user, idx, reshape_output_node.dependencies.empty());
- }
- }
- }
- }
- }
-}
-
-//function which prepares given primitive for weights optimization
-template <typename T>
-void program_impl::optimize_weights(T& node, layout_optimizer& lo)
-{
- auto weights_offset = node.get_primitive()->input.size();
- auto bias_offset = weights_offset + wrap_if_single(node.get_primitive()->weights).size();
- for (auto i = weights_offset; i < bias_offset; i++)
- {
- auto& weights = node.get_dependency(i);
- auto* impl = node.get_selected_impl().get();
- auto output_layout = node.get_output_layout();
- auto& weights_node = node.get_dependency(1);
- auto weights_layout = weights_node.get_output_layout();
- const auto weights_type = layout_optimizer::data_type::weights;
-
- auto reorders = lo.get_generic_layer(
- impl->_weights_reorder_params,
- weights.id(),
- weights_layout,
- weights_type);
-
- for (auto& reorder : reorders)
- {
- //insert new generic_layer node to topology
- this->add_intermediate(reorder.first, node, i, !reorder.second);
- //set generic_layer's node output layout and implementation
- auto& g_node = node.get_dependency(i);
- g_node.get_output_layout(false);
- g_node.selected_impl = g_node.type()->choose_impl(*engine, g_node);
- }
- //set the old output layout and do not invalidate users as change of weights will not affect output layout
- node.set_output_layout(output_layout, false);
- }
-}
-template void program_impl::optimize_weights<convolution_node>(convolution_node& node, layout_optimizer& lo);
-template void program_impl::optimize_weights<deconvolution_node>(deconvolution_node& node, layout_optimizer& lo);
-template void program_impl::optimize_weights<fully_connected_node>(fully_connected_node& node, layout_optimizer& lo);
-
-void program_impl::post_optimize_weights(layout_optimizer& lo)
-{
- for (auto& p : nodes_map)
- {
- auto& prim = *p.second;
- if (prim.type() == convolution::type_id())
- {
- optimize_weights(prim.as<convolution>(), lo);
- }
- else if (prim.type() == deconvolution::type_id())
- {
- optimize_weights(prim.as<deconvolution>(), lo);
- }
- else if (prim.type() == fully_connected::type_id())
- {
- optimize_weights(prim.as<fully_connected>(), lo);
- }
- //else if (prim.type() == lstm_gemm::type_id())//TODO: Enable postoptimize weights for lstm
- //{
- // prep_opt(prim.as<lstm_gemm>()); //we should take care of weights and reccurent
- //}
- }
-}
-
-template <typename T>
-void program_impl::optimize_depthwise_sep_post(T& node)
-{
- if (!node.get_depthwise_sep_opt())
- return;
-
- const auto& split = node.get_primitive()->split();
-
- auto dependency_offset = node.get_primitive()->input.size();
- //concatenate weights
- {
- //if weights were optimized it is needed to use the sizes after optimization
- auto target_layout = get_weights_layout(node.get_dependency(dependency_offset), split);
- merge_buffers(engine, node, target_layout, dependency_offset, dependency_offset + split);
- dependency_offset++;
- }
-
- //concatenate biases
- if (node.get_primitive()->bias.size() != 0)
- {
- const auto& bias_layout = node.get_dependency(dependency_offset).get_output_layout();
- auto target_layout = layout(bias_layout.data_type, cldnn::format::bfyx, { 1, 1, bias_layout.size.spatial[0] * split, 1 });
- merge_buffers(engine, node, target_layout, dependency_offset, dependency_offset + split);
- dependency_offset++;
- }
-
- if (node.template is_type<convolution>())
- {
- auto& prim_node = node.template as<convolution>();
- const auto& prim = prim_node.get_primitive();
-
- // concatenate weights quantization factors
- if (prim->weights_quantization_factors.size() != 0)
- {
- const auto& weights_quantization_layout = node.get_dependency(dependency_offset).get_output_layout();
- auto target_layout = layout(weights_quantization_layout.data_type, cldnn::format::bfyx, { 1, 1, weights_quantization_layout.size.batch[0] * split, 1 });
- merge_buffers(engine, node, target_layout, dependency_offset, dependency_offset + split);
- dependency_offset++;
- }
- // concatenate output callibration factors
- if (prim->output_calibration_factors.size() != 0)
- {
- const auto& output_callibration_layout = node.get_dependency(dependency_offset).get_output_layout();
- auto target_layout = layout(output_callibration_layout.data_type, cldnn::format::bfyx, { 1, 1, output_callibration_layout.size.batch[0] * split, 1 });
- merge_buffers(engine, node, target_layout, dependency_offset, dependency_offset + split);
- dependency_offset++;
- }
- }
-
- if (node.get_primitive())
- //override node split, as only one kernel will be executed
- node.set_split(1);
-}
-template void program_impl::optimize_depthwise_sep_post<convolution_node>(convolution_node& node);
-template void program_impl::optimize_depthwise_sep_post<deconvolution_node>(deconvolution_node& node);
-
-void program_impl::prep_opt_depthwise_sep_post()
-{
- //depthiwise separated convolution/deconvolution optimization
- for (auto& p : nodes_map)
- {
- auto& prim = *p.second;
- if (prim.type() == convolution::type_id())
- {
- optimize_depthwise_sep_post(prim.as<convolution>());
- }
- else if (prim.type() == deconvolution::type_id())
- {
- optimize_depthwise_sep_post(prim.as<deconvolution>());
- }
- }
-}
-
-void program_impl::apply_needed_padding(program_node& node, program_node& prev_node,
- const padding& needed_padding)
-{
- auto target_layout = prev_node.get_output_layout();
-
- // Short circuit if padding did not change.
- if (target_layout.data_padding == needed_padding)
- return;
-
- // Special handling for input nodes.
- if (prev_node.is_type<input_layout>() || prev_node.is_type<mutable_data>())
- {
- target_layout.data_padding = needed_padding;
-
- auto r_prim = std::make_shared<reorder>("reorder_input_" + node.id(), prev_node.id(), target_layout);
- add_intermediate(r_prim, node, 0);
- return;
- }
-
- prev_node.merge_output_padding(needed_padding);
-}
-
-void program_impl::prepare_padding()
-{
- if (output_size_handling_enabled)
- {
- // Prepare upper padding for primitives that support output_size parameter.
- for (const auto& node : processing_order)
- {
- if (node->is_type<convolution>())
- {
- auto& prim_node = node->as<convolution>();
- const auto& prim = prim_node.get_primitive();
-
- if (!prim->with_output_size)
- continue;
-
- auto filter_size = prim_node.weights(0).get_output_layout().size;
-
- auto needed_padding = calc_sliding_window_needed_input_padding(
- prim_node.input().get_output_layout(),
- prim->output_size, filter_size, prim->input_offset, prim->stride, prim->dilation, false, 1);
- apply_needed_padding(prim_node, prim_node.input(), needed_padding);
- }
- else if (node->is_type<deconvolution>())
- {
- auto& prim_node = node->as<deconvolution>();
- const auto& prim = prim_node.get_primitive();
-
- if (!prim->with_output_size)
- continue;
-
- auto filter_size = prim_node.weights(0).get_output_layout().size;
-
- auto needed_padding = calc_sliding_window_needed_input_padding(
- prim_node.input().get_output_layout(),
- prim->output_size, filter_size, prim->input_offset, prim->stride, { 1, 1, 1, 1 }, true, 1);
-
- apply_needed_padding(prim_node, prim_node.input(), needed_padding);
- }
- else if (node->is_type<pooling>())
- {
- auto& prim_node = node->as<pooling>();
- const auto& prim = prim_node.get_primitive();
-
- if (!prim->with_output_size)
- continue;
-
- // NOTE: Currently there is no pooling implementation/pooling mode which does not check input data range.
- // There is no need to add padding requirements on pooling inputs.
- //auto needed_padding = calc_sliding_window_needed_input_padding(
- // prim_node.input().get_output_layout(),
- // prim->output_size, prim->size, prim->input_offset, prim->stride, {1, 1, 1, 1}, false, 1);
- auto needed_padding = prim_node.input().get_output_layout().data_padding;
-
- apply_needed_padding(prim_node, prim_node.input(), needed_padding);
- }
- }
- }
-
- // Prepare optimized padding for bfyx convolution.
- for (auto& pair : nodes_map)
- {
- if (pair.second->type() != convolution::type_id())
- continue;
-
- auto& node = pair.second->as<convolution>();
- if (node.get_dependencies().empty())
- continue;
-
- auto conv = node.get_primitive();
- auto& conv_input_node = node.get_dependency(0);
- auto conv_layout = node.get_output_layout();
-
- // right now output padding optimization is only available for bfyx format and data type = float32
- if (conv_layout.format != cldnn::format::bfyx
- && conv_layout.format != cldnn::format::bf8_xy16
- && conv_layout.format != cldnn::format::byxf_af32
- && conv_layout.format != cldnn::format::fs_bs_yx_bsv4_fsv32)
- {
- continue;
- }
-
- // Calculating input padding needed for convolution
- auto& filter_node = node.as<convolution>().weights(0);
- auto filter_prim = filter_node.get_primitive();
-
- layout filter_layout = filter_node.get_output_layout();
-
- // convolution have only one input primitive
- auto prev_prim_output_layout = conv_input_node.get_output_layout();
-
- // Compute initial required paddings for primitive used as input for convolution.
- auto input_offset = conv->input_offset;
- auto stride = conv->stride;
- auto dilation = conv->dilation;
-
- auto input_limit_x = input_offset.spatial[0] + (conv_layout.size.spatial[0] - 1) * stride.spatial[0] + (filter_layout.size.spatial[0] - 1) * dilation.spatial[0] + 1;
- auto input_limit_y = input_offset.spatial[1] + (conv_layout.size.spatial[1] - 1) * stride.spatial[1] + (filter_layout.size.spatial[1] - 1) * dilation.spatial[1] + 1;
-
- auto left_padding = std::max(-input_offset.spatial[0], 0);
- auto top_padding = std::max(-input_offset.spatial[1], 0);
- auto right_padding = std::max(input_limit_x - prev_prim_output_layout.size.spatial[0], 0);
- auto bottom_padding = std::max(input_limit_y - prev_prim_output_layout.size.spatial[1], 0);
-
- // Adjust right padding, so entire buffer size in X dimension is properly aligned.
- // TODO: NOTE: Will be reenabled with next check-in once heuristic for line-aligned algorithm will be added.
- //auto needed_buffer_size_x = static_cast<cldnn::tensor::value_type>(
- // round_up_to(left_padding + prev_prim_output_layout.size.spatial[0] + right_padding, 16));
- //right_padding = needed_buffer_size_x - left_padding - prev_prim_output_layout.size.spatial[0];
-
- cldnn::padding needed_padding({ 0, 0, left_padding, top_padding }, { 0, 0, right_padding, bottom_padding }, 0);
- needed_padding = padding::max(prev_prim_output_layout.data_padding, needed_padding);
-
- apply_needed_padding(node, conv_input_node, needed_padding);
- }
-}
-
-void program_impl::propagate_constants()
-{
- constants_propagator prop(this);
-
- for (auto& node : processing_order)
- prop.visit_node(*node);
-
- auto&& to_replace = prop.calculate();
-
- //remove all nodes which are no longer relevant, i.e. nodes which:
- // 1. are constants, and
- // 2. do not have non-const user (so their data are not used during inference), and
- // 3. are not marked as outputs.
- // in case if node has either non-const user or is marked as output, it should be replace with cldnn::data rather than removed (see next loop)
- auto proc_itr = processing_order.begin();
- while (proc_itr != processing_order.end())
- {
- auto& node = (*proc_itr++);
- if (!node->is_constant())
- continue;
- if (node->has_non_const_user() || (node->is_output() && !node->is_type<data>()))
- continue;
-
- auto& users = node->users;
- auto& deps = node->dependencies;
-
- for (size_t idx = 0; idx < deps.size(); idx++)
- {
- deps.at(idx)->users.remove(node);
- }
- deps.clear();
-
- for (auto& usr : users) {
- auto& usr_deps = usr->dependencies;
- usr_deps.erase(std::remove(usr_deps.begin(), usr_deps.end(), node), usr_deps.end());
- }
- users.clear();
-
- if (!node->is_output())
- {
- auto rem = remove_if_dangling(*node);
- assert(rem && "Non-output constant node which has only constant users should have been removed during constants propagation pass");
- (void)rem;
- }
- }
-
- //replace all constant nodes which are relevant for inference (either used by non-const user or marked as output) with recomputed cldnn::data
- for (auto& cout : to_replace)
- {
- auto& id_to_replace = cout.first;
-
- //TODO: do not use API primitives internally and get rid of this last 'cldnn::memory' internal usage
- memory api_memory = details::memory_c_to_cpp_converter::convert(api_cast(cout.second.get()));
- //c-cpp converter does not retain since normally it is done inside API-impl layer (cldnn.cpp) so we need to do it manually
- cout.second->add_ref();
-
- auto const_data = std::make_shared<data>("_cldnn_const_prop_" + id_to_replace, api_memory /* <<< REMOVE ME WHEN POSSIBLE */);
- auto& new_node = get_or_create(const_data);
- auto& curr_node = *nodes_map.at(id_to_replace);
-
- if (!curr_node.is_type<generic_layer>())
- {
- auto curr_node_deps = curr_node.get_dependencies();
- for (auto& dep : curr_node_deps)
- {
- auto dep_users = dep->get_users();
- for (auto& dep_user : dep_users)
- {
- if (dep_user == &curr_node)
- remove_connection(*dep, curr_node);
- }
- }
- }
+ if (!get_engine().configuration().enable_memory_pool)
+ return;
- curr_node.dependencies.clear();
- //remove all constant users (as they will be either removed or replaced by cldnn::data which does not have any dependencies)
- curr_node.users.erase(
- std::remove_if(curr_node.users.begin(), curr_node.users.end(), [](program_node* node) { return node->is_constant(); }),
- curr_node.users.end()
- );
- replace(curr_node, new_node, false, false);
- }
+ basic_memory_dependencies();
+ skipped_branch_memory_dependencies();
+ oooq_memory_dependencies();
}
-void program_impl::prepare_buffer_fusing()
+std::string program_impl::get_memory_dependencies_string() const
{
- bool is_debug = options.get<build_option_type::debug>()->enabled();
+ std::string mem_dep = "Memory dependencies/restrictions:\n";
auto itr = processing_order.begin();
while (itr != processing_order.end())
{
- auto& node = (*itr++);
+ auto& node = *itr;
+ itr++;
+ mem_dep = mem_dep.append("primitive: ").append(node->id()).append(" restricted list: ");
+ for (auto it : node->get_memory_dependencies())
+ mem_dep == mem_dep.append(it).append(", ");
+ mem_dep = mem_dep.append("\n");
+ }
+ return mem_dep;
+}
- // TODO: Move fused activation to previous layer when possible
- if (node->fused_activation.activation_func != cldnn_activation_func_t::activation_none)
- continue;
+void program_impl::handle_reshape()
+{
+ //reshape primitive by definition does not change underlying data, only shape description
+ //however during graph initialization and data optimization the layouts can be changed without user's knowledge,
+ //when reshape is followed by reorder, it is likely that reorder's output will not be as expected (for example reshape with flattened shape)
+ //this pass resolved the issue by changing graph in the following way
+ //- in case reshape has multiple users with reshape->reorder sequence, it will be splitted to multiple reshape primitives with single user
+ //- in case of reshape->reorder sequence, the additional reorder before reshape will be added,
+ // if last reorder does not contain padding or mean subtract, it will be removed later in the graph
- do_for_types<concatenation>(*node, [this, is_debug](concatenation_node& node)
+ for (const auto& node : processing_order)
+ {
+ if (node->is_type<reshape>())
{
- // buffer fusing should not be performed if one of inputs produces padded output since
- // it could break desired memory alignment. On the other hand, if this node uses all inputs
- // exclusively (see check above) they should not have output padding set since concatenation
- // does not ask for any.
- if (node.has_padded_dependency())
- return;
+ auto& input_node = node->get_dependency(0);
- auto concat_axis = node.get_primitive()->axis;
- auto padd = node.get_output_layout().data_padding;
+ if (input_node.is_type<reorder>())
+ continue;
+
+ node->get_output_layout();
+ if (node->as<reshape>().is_in_place())
+ node->optimized = true;
- tensor lower_padd = padd.lower_size();
- tensor upper_padd = padd.upper_size();
+ //vector for storing nodes that are reorder type, for which splitted primitives are needed (except for the first one where orginal reshape will be used)
+ std::vector<program_node*> reorder_node_to_split;
- auto upper_padd_val = node.get_output_layout().get_buffer_size().raw[concat_axis] - lower_padd.raw[concat_axis];
- tensor lower_padd_offset = lower_padd;
+ //find the users of reshape that are reorder type, if none present then skip the current node
+ for (const auto& user : node->get_users())
+ {
+ if (user->is_type<reorder>())
+ reorder_node_to_split.push_back(user);
+ }
- std::list<std::pair<const std::vector<program_node*>, tensor>> stack = { std::make_pair(node.get_dependencies(), tensor{ 0, 0, 0, 0 }) };
- while (!stack.empty())
+ if (!reorder_node_to_split.empty())
{
- auto nodes_list = stack.front();
- stack.pop_front();
+ auto& prim_node = node->as<reshape>();
+ const auto& prim = prim_node.get_primitive();
+ auto output_shape = prim->output_shape;
- auto cascade_adjustment = nodes_list.second;
- upper_padd.raw[concat_axis] = upper_padd_val;
- lower_padd = lower_padd_offset;
+ //vector for storing reshape nodes to connect to new reorder nodes (if needed)
+ std::vector<program_node*> reorder_reshape_nodes;
- //check if concatenation in place can be applied for inputs set
- for (auto input : nodes_list.first)
+ bool skip_first_user = false;
+ auto reshape_users = node->get_users();
+ for (const auto& user : reshape_users)
{
- //if any of this node's inputs is used by more than one primitive and is not optimized concatenation then do not fuse buffers,
- //also, if an input is marked as network output, prevent optimizations which would affect a form of its output (unless debug flag is set)
- // todo: in future, if this case is problem, it can be optimized further to enable buffer fusing
- // per single input rather than all/none
- // + restrict input types to pooling, convolution and activation only due to problems with output padding on b and f
- if ((!input->is_type<pooling>() && !input->is_type<convolution>() && !input->is_type<activation>() && !input->is_type<concatenation>() && !input->is_type<crop>() && !input->is_type<scale>()) ||
- (input->is_output() && !is_debug) ||
- input->get_users().size() > 2)
- return;
-
- if (input->get_users().size() > 1)
+ //reshape node for first user will be the orginal reshape from the graph
+ if (!skip_first_user)
{
- auto user_count = input->get_users().size();
- for (auto& user : input->get_users())
- if (user->is_type<concatenation>())
- user_count--;
- if (user_count > 1)
- return;
+ if (std::find(reorder_node_to_split.begin(), reorder_node_to_split.end(), user) != reorder_node_to_split.end())
+ reorder_reshape_nodes.push_back(node);
+ skip_first_user = true;
+ continue;
}
- //check only for spatial paddings. Accept feature and batch
- if (input->get_output_layout().data_padding.lower_size().spatial[0] != 0 ||
- input->get_output_layout().data_padding.upper_size().spatial[0] != 0 ||
- input->get_output_layout().data_padding.lower_size().spatial[1] != 0 ||
- input->get_output_layout().data_padding.upper_size().spatial[1] != 0)
- return;
- }
-
- //apply concatenation in place optimization
- for (auto input : nodes_list.first)
- {
- auto input_lenght = input->get_output_layout().size.raw[concat_axis];
-
- // shrink upper pad so it points at the end of the input's buffer
- //
- // |--- lower padd ---| |---------- upper padd -----------|
- // |-- output padd ---| ----- input1 ------|----- input2 -----|-- out padd --|
- upper_padd.raw[concat_axis] -= input_lenght;
-
- //adjust padding sizes for cascade concatenations
- auto lower_padd_tmp = lower_padd;
- lower_padd_tmp.raw[concat_axis] += cascade_adjustment.raw[concat_axis];
- auto upper_padd_tmp = upper_padd;
- upper_padd_tmp.raw[concat_axis] -= cascade_adjustment.raw[concat_axis];
-
- // set new padding for input
- input->set_output_padding(padding(lower_padd_tmp.sizes(), upper_padd_tmp.sizes()));
-
- // move lower padd further
- //
- // |-------------- lower padd -------------|---------- upper padd -----------|
- // |-- output padd ---| ----- input1 ------|----- input2 -----|-- out padd --|
-
- lower_padd.raw[concat_axis] += input_lenght;
-
- if (input->type() == concatenation::type_id() && input->can_be_optimized())
+ //other reshapes will be clones of the orginal one connected to reshape->reorder sequences
+ if (std::find(reorder_node_to_split.begin(), reorder_node_to_split.end(), user) != reorder_node_to_split.end())
{
- if (input->as<concatenation>().get_primitive()->axis != node.get_primitive()->axis)
- return;
-
- if (!input->get_dependencies().empty())
- stack.push_back(std::make_pair(input->get_dependencies(), input->get_output_layout().data_padding.lower_size()));
+ auto new_reshape = std::make_shared<reshape>("_reshape_split_" + user->id() + "_" + node->id(), input_node.id(), output_shape);
+ auto& new_reshape_node = get_or_create(new_reshape);
+ add_connection(input_node, new_reshape_node);
+ user->replace_dependency(0, new_reshape_node);
+ processing_order.insert_next(&input_node, &new_reshape_node);
+ reorder_reshape_nodes.push_back(&new_reshape_node);
}
}
- }
-
- node.can_be_optimized(true);
- });
-
- // zero copy
- do_for_types<crop>(*node, [this, is_debug](crop_node& node)
- {
- //if the node is marked as network output, prevent optimizations which would affect a form of its output, unless debug flag is set
- if (node.is_output() && !is_debug)
- return;
-
- //do not optimize when next node is concatenation which is not output
- if (node.get_users().size() == 1 && node.get_users().front()->is_type<concatenation>() && !node.get_users().front()->is_output())
- return;
- if (node.get_dependencies().size() == 1 &&
- node.get_users().size() > 0)
- {
- // optimization is avaiable for croping across depth(features) only
- // if output padding has defined padding accross featuers already it wouldn't
- // work because it expect to have zeros in the padded area.
- auto format = node.get_output_layout().format;
- auto crop_prim = node.get_primitive();
- auto input_layout = node.get_dependency(0).get_output_layout();
- auto out_padd = node.get_output_layout().data_padding;
- if (format == format::bfyx &&
- crop_prim->reference_input.batch[0] == input_layout.size.batch[0] &&
- crop_prim->reference_input.spatial[0] == input_layout.size.spatial[0] &&
- crop_prim->reference_input.spatial[1] == input_layout.size.spatial[1] &&
- out_padd.lower_size().feature[0] == 0 &&
- out_padd.upper_size().feature[0] == 0 &&
- out_padd.lower_size().batch[0] == 0 &&
- out_padd.upper_size().batch[0] == 0 &&
- out_padd.lower_size().spatial[0] == 0 &&
- out_padd.lower_size().spatial[1] == 0 &&
- out_padd.upper_size().spatial[0] == 0 &&
- out_padd.upper_size().spatial[1] == 0)
+ //add new reorder nodes to proper reshape node
+ auto reshape_reorder_id = 0;
+ for (const auto& reorder_node : reorder_node_to_split)
{
- // Regular crop
- // crop input buffer
- // |___________data____________|
- //
- // crop output buffer
- // |-------->| offsets[f] |<--|
- // |_____data____|
- // <------------>
- // reference size
- //
- // Inplace crop
- // crop output buffer
- // |_low_pad_|__data_size__|___|<-upper pad
-
- node.set_output_padding(padding(
- { out_padd.lower_size().batch[0], crop_prim->offsets.feature[0], out_padd.lower_size().spatial[0], out_padd.lower_size().spatial[1] },
- { out_padd.upper_size().batch[0], input_layout.size.feature[0] - crop_prim->offsets.feature[0] - crop_prim->reference_input.feature[0],
- out_padd.upper_size().spatial[0], out_padd.upper_size().spatial[1] }));
- node.can_be_optimized(true);
+ /*
+ auto new_reshape = std::make_shared<reshape>("_reshape_split_" + user->id() + "_" + node->id(), input_node.id(), output_shape);
+ auto& new_reshape_node = get_or_create(new_reshape);
+ add_connection(input_node, new_reshape_node);
+ user->replace_dependency(0, new_reshape_node);
+ processing_order.insert(std::next(processing_order.get_processing_iterator(input_node)), &new_reshape_node);
+ reorder_reshape_nodes.push_back(&new_reshape_node);
+ */
+ auto& reorder_reshape_node = reorder_reshape_nodes[reshape_reorder_id];
+ auto reshape_in_layout = reorder_node->get_output_layout();
+ auto reshape_input = std::make_shared<reorder>("_reshape_input_" + reorder_node->id() + "_" + reorder_reshape_node->id(), input_node.id(),
+ reshape_in_layout.format, reshape_in_layout.data_type);
+ auto& reshape_input_node = get_or_create(reshape_input);
+ add_intermediate(reshape_input_node, *reorder_reshape_node, 0, reshape_input_node.dependencies.empty());
+ reshape_reorder_id++;
}
}
- });
- do_for_types<reshape>(*node, [this](reshape_node& node)
- {
- node.get_output_layout();
- if (node.is_in_place() && node.get_fused_activation_func() == activation_none)
- node.can_be_optimized(true);
- });
- do_for_types<reorder>(*node, [this](reorder_node& node)
- {
- auto& input = node.input();
- auto output_layout = node.get_output_layout();
- //This is WA for topologies that due to additional reorders added perform worse with conv1x1 optimization
- auto remove_bf8_xy_opt = ((input.is_type<pooling>() || input.is_type<concatenation>()) &&
- output_layout.format == format::bf8_xy16 && input.get_users().size() == 1);
- //Remove reorder from convolution 1x1 to bfyx in some conditions
- auto remove_byxf_opt = (input.is_type<convolution>() &&
- input.get_users().size() == 1 &&
- input.get_output_layout().format == format::byxf);
- //check if all inputs user have the same format
- auto all_users_same_format = true;
- auto input_user_layout_format = input.get_users().front()->get_output_layout().format;
- for (auto const& user : input.get_users())
+ auto reshape_layout = node->get_output_layout();
+ if (!(node->is_output()) && (reshape_layout.format != cldnn::format::bfyx))
{
- if (user->get_output_layout().format != input_user_layout_format)
+ auto bfyx_layout = layout({ reshape_layout.data_type, cldnn::format::bfyx, reshape_layout.size });
+ //when some primitive does an implicit reorder to some other format then we lose the info about pitches in reshape stage
+ //we assume user provides the input vector in bfyx
+ if (!program_helpers::are_layouts_identical(reshape_layout, bfyx_layout).second)
{
- all_users_same_format = false;
- break;
- }
- }
- auto same_data_type = input.get_output_layout().data_type == output_layout.data_type;
- //Optimization only available in case of layers that support different input and output formats.
- //todo: new api needs to be created to read such caps
- if (!(input.is_type<pooling>() && (output_layout.format == format::bfyx || output_layout.format == format::yxfb || output_layout.format == format::byxf) && all_users_same_format && same_data_type) &&
- !remove_bf8_xy_opt &&
- !(input.is_type<convolution>() && input.get_output_layout().format == format::bf8_xy16) &&
- !(input.is_type<eltwise>() && (output_layout.format == format::bfyx || output_layout.format == format::yxfb || output_layout.format == format::byxf) && all_users_same_format && same_data_type) &&
- !(remove_byxf_opt && (node.get_users().front()->is_type<eltwise>() || node.get_users().front()->is_type<pooling>())))
- return;
-
- if (remove_bf8_xy_opt)
- {
- auto users_user_layout = node.get_users().front()->get_users().front()->get_output_layout();
- // if users_user_layout is still bf8_yx16 (stacked convolutions) then leave the reorder
- if (users_user_layout.format == format::bf8_xy16)
- return;
- auto input_layout = input.get_output_layout();
- auto target_layout = layout(input_layout.data_type, users_user_layout.format, input_layout.size, input_layout.data_padding);
- input.set_output_layout(target_layout, false);
- }
- else if (remove_byxf_opt)
- {
- auto user = node.get_users().front();
- auto users_users = node.get_users().front()->get_users();
+ auto reshape_input = std::make_shared<reorder>("_reshape_input_" + node->id(), input_node.id(), cldnn::format::bfyx, reshape_layout.data_type);
+ auto& reshape_input_node = get_or_create(reshape_input);
+ add_intermediate(reshape_input_node, *node, 0, reshape_input_node.dependencies.empty());
- for (auto const& users_user : users_users)
- {
- if (users_user->get_output_layout().format != format::byxf && !users_user->is_type<eltwise>())
+ auto reshape_users = node->get_users();
+ for (const auto& user : reshape_users)
{
- remove_byxf_opt = false;
- break;
+ size_t idx = 0;
+ for (size_t i = 0; i < user->get_dependencies().size(); i++)
+ {
+ auto& input = user->get_dependency(i);
+ if (input.id() == node->id()) {
+ idx = i;
+ break;
+ }
+ }
+ auto reshape_output = std::make_shared<reorder>("_reshape_output_" + node->id(), user->id(), reshape_layout.format, reshape_layout.data_type);
+ auto& reshape_output_node = get_or_create(reshape_output);
+ add_intermediate(reshape_output_node, *user, idx, reshape_output_node.dependencies.empty());
}
}
-
- if (remove_byxf_opt)
- {
- auto input_layout = input.get_output_layout();
- user->set_output_layout(input_layout, false);
- }
}
- else
- input.set_output_layout(output_layout, false);
-
- node.can_be_optimized(true);
- extract_and_remove(node); //try to remove redundant reorders
- });
+ }
}
}
-void program_impl::fuse_skip_layers(program_node* node)
+void program_impl::apply_needed_padding(program_node& node, program_node& prev_node,
+ const padding& needed_padding)
{
- do_for_types<eltwise>(*node, [this](eltwise_node& node)
- {
- bool skippable = false;
- int index = 0;
- if (node.get_primitive()->mode != eltwise_mode::sum || node.inputs_count() != 2)
- return;
-
- if (node.input(0).is_type<deconvolution>())
- {
- skippable = true;
- }
- else if (node.input(1).is_type<deconvolution>())
- {
- skippable = true;
- index = 1;
- }
-
- if (!skippable)
- return;
-
- auto& to_fuse_with = node.input(index).as<deconvolution>();
- int to_fuse_index = index == 0 ? 1 : 0;
-
- // check that node doesn't have fused eltwise already
- if (to_fuse_with.has_fused_sum())
- return;
-
- //remove dependencies and users of elwtise that is going to be extracted
- add_connection(node.input(to_fuse_index), to_fuse_with);
- remove_connection(node.input(to_fuse_index), node);
-
- //replace processing_num of the node where fusing take place and eltwise
- auto new_processing_num = node.processing_num;
- processing_order.erase(to_fuse_with.processing_itr);
- to_fuse_with.processing_itr = processing_order.insert(node.processing_itr, &to_fuse_with);
- to_fuse_with.processing_num = new_processing_num;
-
- //make sure that new fused node's users have higher processing_num than fused node
- for (auto user : to_fuse_with.get_users())
- {
- if (user->processing_num < new_processing_num)
- {
- processing_order.erase(user->processing_itr);
- user->processing_itr = processing_order.insert(std::next(to_fuse_with.processing_itr), user);
- user->processing_num = new_processing_num + 1;
- }
- }
-
- if (node.get_fused_activation_func() != activation_none)
- to_fuse_with.set_fused_activation(node.get_fused_activation_func(), node.get_fused_activation_params());
- to_fuse_with.set_output_padding(node.get_output_layout().data_padding);
-
- extract_and_remove(node);
- });
-}
+ auto target_layout = prev_node.get_output_layout();
-void program_impl::prepare_primitive_fusing()
-{
- bool is_debug = options.get<build_option_type::debug>()->enabled();
+ // Short circuit if padding did not change.
+ if (target_layout.data_padding == needed_padding)
+ return;
- auto itr = processing_order.begin(); //note we need to use iterators since currently processed element can be removed
- while (itr != processing_order.end())
+ // Special handling for input nodes.
+ if (prev_node.is_type<input_layout>() || prev_node.is_type<mutable_data>())
{
- auto node_itr = itr++;
- auto& node = (*node_itr);
-
- do_for_types<activation>(*node, [this, is_debug](activation_node& node)
- {
-
- auto& input = node.input();
-
- //Restrictions:
- // - inputs cannot be padded
- // - primitives input cannot be output
- // - no activation additional input
- // - input was optimized
- if (node.has_padded_dependency() || (input.is_output() && !is_debug) || node.is_output() ||
- node.get_dependencies().size() != 1 || input.can_be_optimized())
- return;
-
- // - check if there is no activation fused already
- // - limit to primitives which implementations support activation fusing
- if (input.get_users().size() != 1 || input.get_fused_activation_func() != activation_none ||
- //TODO: new api needs to be created to read such caps
- //right now use whitelist so no new primitives will be affected in case of lack of fused activation support
- (!input.is_type<batch_norm>() && !input.is_type<concatenation>() && !input.is_type<convolution>() &&
- !input.is_type<crop>() && !input.is_type<deconvolution>() && !input.is_type<eltwise>() &&
- !input.is_type<fully_connected>() && !input.is_type<lrn>() && !input.is_type<normalize>() &&
- !input.is_type<permute>() && !input.is_type<pooling>() && !input.is_type<reorder>() &&
- !input.is_type<roi_pooling>() && !input.is_type<scale>() &&
- !input.is_type<softmax>() && !input.is_type<upsampling>() && !input.is_type<mvn>()))
- return;
-
- input.set_fused_activation(node.get_primitive()->activation_func, node.get_primitive()->additional_params);
- input.set_output_padding(node.get_output_layout().data_padding);
+ target_layout.data_padding = needed_padding;
- extract_and_remove(node);
- });
+ auto r_prim = std::make_shared<reorder>("reorder_input_" + node.id(), prev_node.id(), target_layout);
+ add_intermediate(r_prim, node, 0);
+ return;
}
- //Second loop tries fusing several reorders one by one (if present) into one reorder
- itr = processing_order.begin();
- while (itr != processing_order.end())
- {
- auto node_itr = itr++;
- auto& node = (*node_itr);
-
- do_for_types<reorder>(*node, [this, is_debug](reorder_node& node)
- {
- auto& input = node.input();
-
- //Restrictions:
- // - inputs cannot be padded
- // - primitives input cannot be output
- // - input was optimized
- if (node.has_padded_dependency() || (input.is_output() && !is_debug) || node.get_dependencies().size() != 1 ||
- input.can_be_optimized())
- return;
-
- // - check if previous node is reorder with 1 user
- // - do not fuse if current node has mean subtract
- if (input.get_users().size() != 1 || !input.is_type<reorder>() ||
- node.has_mean() || !node.get_primitive()->subtract_per_feature.empty())
- return;
+ prev_node.merge_output_padding(needed_padding);
+}
- input.set_output_layout(node.get_output_layout(), false);
- extract_and_remove(node);
- });
- }
- //Third loop tries fusing eltwise (sum) with deconvolution
- itr = processing_order.begin();
- while (itr != processing_order.end())
+void program_impl::reverse_connection(program_node& dep_node, program_node& user_node)
+{
+ if (std::find(dep_node.users.begin(), dep_node.users.end(), &user_node) != dep_node.users.end())
{
- auto node_itr = itr++;
- auto& node = (*node_itr);
-
- fuse_skip_layers(node);
+ remove_connection(dep_node, user_node);
+ add_connection(user_node, dep_node);
}
+ else
+ throw std::runtime_error("Trying to reverse connection, but nodes are wrongly or not connected.");
}
program_node& program_impl::get_or_create(std::shared_ptr<primitive> prim)
return *itr->second;
auto new_node = prim->type->create_node(*this, prim);
- new_node->set_org_primitive_id(new_node->id());
nodes_map.insert(itr, { prim->id, new_node });
return *new_node;
}
-void program_impl::add_intermediate(program_node& node, program_node& next, size_t prev_idx, bool connect_int_node_with_old_dep)
+void program_impl::add_intermediate(program_node& node, program_node& next, size_t prev_idx,
+ bool connect_int_node_with_old_dep, bool move_usrs_of_prev_to_node)
{
if (connect_int_node_with_old_dep && !node.dependencies.empty())
- throw std::invalid_argument("Node which is about to be added inbetween two other nodes should not have any existing dependencies");
+ throw std::invalid_argument("Node which is about to be added in between two other nodes should not have any existing dependencies");
auto& prev = next.get_dependency(prev_idx);
//firstly add connection, later replace dependency, so 'prev' won't become dangling and therefore removed
if (connect_int_node_with_old_dep)
{
add_connection(prev, node);
- if (node.processing_itr != processing_order.end())
- processing_order.erase(node.processing_itr);
+ if (processing_order.size() != 0)
+ {
+ processing_order.insert_next(&prev, &node);
+ }
+ }
- auto itr = prev.processing_itr;
- node.processing_itr = processing_order.insert(++itr, &node);
- node.processing_num = prev.processing_num;
+ if (move_usrs_of_prev_to_node) {
+ auto itr = prev.get_users().begin();
+ while(itr!= prev.get_users().end())
+ {
+ auto usr = *itr;
+ itr++;
+ if (usr->id() != node.id())
+ usr->replace_dependency(prev, node);
+ }
+ mark_if_constant(prev);
+ mark_if_constant(node);
+ mark_if_data_flow(prev);
+ mark_if_data_flow(node);
+ }
+ else {
+ next.replace_dependency(prev_idx, node);
+ node.constant = prev.constant;
+ node.data_flow = prev.data_flow;
}
+}
+
+void program_impl::add_intermediate(std::shared_ptr<primitive> prim, program_node& next, size_t prev_idx,
+ bool connect_int_node_with_old_dep, bool move_usrs_of_prev_to_node)
+{
+ add_intermediate(get_or_create(prim), next, prev_idx, connect_int_node_with_old_dep, move_usrs_of_prev_to_node);
+}
- next.replace_dependency(prev_idx, node);
- node.constant = prev.constant;
- node.data_flow = prev.data_flow;
- if (prev.constant_frontier)
+void program_impl::add_connection(program_node& prev, program_node& next)
+{
+ prev.users.push_back(&next);
+ next.dependencies.push_back(&prev);
+}
+
+void program_impl::remove_connection(program_node& prev, program_node& next)
+{
+ prev.users.remove(&next);
+ next.dependencies.erase(std::remove(next.dependencies.begin(), next.dependencies.end(), &prev), next.dependencies.end());
+}
+
+void program_impl::remove_all_connections(program_node& node) {
+ // since the graph is not topological sorted, we need to remove the node from both dependencies and users
+ for (auto &e : node.users)
+ {
+ e->dependencies.erase(std::remove(e->dependencies.begin(), e->dependencies.end(), &node), e->dependencies.end());
+ }
+ for (auto &e : node.dependencies)
{
- node.constant_frontier = true;
- prev.constant_frontier = false;
+ e->users.remove(&node);
}
+ node.dependencies.clear();
+ node.users.clear();
}
void program_impl::rename(program_node & node, primitive_id const & new_id)
}
}
-void program_impl::replace(program_node& old_node, program_node& new_node, bool replace_whole_branch, bool check_output_layouts_integrity)
+void program_impl::replace(program_node& old_node, program_node& new_node)
{
- if ((!new_node.dependencies.empty() && !replace_whole_branch) || !new_node.users.empty())
+ if (!new_node.dependencies.empty() || !new_node.users.empty())
throw std::invalid_argument("Node which is about to replace other node should be detached");
if (new_node.is_output())
new_node.output_layout = old_node.get_output_layout();
new_node.valid_output_layout = old_node.valid_output_layout;
- if (!replace_whole_branch)
+
+ //copy old's dependencies
+ while (!old_node.dependencies.empty())
{
- //copy old's dependencies
- while (!old_node.dependencies.empty())
- {
- auto& dep = old_node.dependencies.back();
- add_connection(*dep, new_node);
- remove_connection(*dep, old_node);
- }
+ auto& dep = old_node.dependencies.front();
+ add_connection(*dep, new_node);
+ remove_connection(*dep, old_node);
}
//append users
old_node.users.clear();
- if (check_output_layouts_integrity && new_node.valid_output_layout)
- new_node.recalc_output_layout();
-
bool old_was_output = false;
//copy node's state
if (old_node.is_output())
inputs.remove(&old_node);
new_node.constant = old_node.constant;
- new_node.constant_frontier = old_node.constant_frontier;
new_node.user_mark = old_node.user_mark;
- auto old_news_pos = new_node.processing_itr;
- new_node.processing_itr = processing_order.insert(old_node.processing_itr, &new_node);
- new_node.processing_num = old_node.processing_num;
- if (old_news_pos != processing_order.end())
- processing_order.erase(old_news_pos);
- if (old_node.processing_itr != processing_order.end())
- processing_order.erase(old_node.processing_itr);
-
+ processing_order.insert(&old_node, &new_node);
+ if (processing_order.get_processing_iterator(old_node) != processing_order.end())
+ processing_order.erase(&old_node);
nodes_map.erase(id);
rename(new_node, id);
}
}
-bool program_impl::remove_if_dangling(program_node& node, bool detach_whole_branch)
+bool program_impl::remove_if_dangling(program_node& node)
{
if (!node.users.empty())
return false;
- if (!detach_whole_branch && !node.dependencies.empty())
+ if (!node.dependencies.empty())
return false;
- std::list<program_node*> to_remove;
- std::list<program_node*> marked;
- if (detach_whole_branch)
- {
- node.mark();
- std::list<program_node*> queue = { &node };
- while (!queue.empty())
- {
- auto curr = queue.front();
- queue.pop_front();
- marked.push_back(curr);
-
- //remove only if all users also has been marked
- bool rem = !std::any_of(curr->get_users().begin(), curr->get_users().end(), [](program_node* node) { return !node->is_marked(); });
- if (rem)
- to_remove.push_back(curr);
-
- for (auto dep : curr->get_dependencies())
- {
- if (!dep->is_marked())
- {
- dep->mark();
- queue.push_back(dep);
- }
- }
- }
- }
- else
- to_remove.push_back(&node);
-
- for (auto n : marked)
- n->unmark();
-
- for (auto rem : to_remove)
+ if (!node.is_output() || is_debug_build())
{
- if (!rem->is_output() || is_debug_build())
- {
- if (detach_whole_branch)
- {
- for (auto& user : rem->get_users())
- user->remove_dependency(*rem);
- }
- if (rem->is_input())
- inputs.remove(rem);
+ if (node.is_input())
+ inputs.remove(&node);
- if (std::find(processing_order.begin(), processing_order.end(), rem) != processing_order.end())
- processing_order.erase(rem->processing_itr);
- optimized_out.push_back(rem->id());
- nodes_map.erase(rem->id());
- }
+ if (std::find(processing_order.begin(), processing_order.end(), &node) != processing_order.end())
+ processing_order.erase(&node);
+ optimized_out.push_back(node.id());
+ nodes_map.erase(node.id());
}
-
return true;
}
node.dependencies.clear();
input.users.remove(&node);
- if (node.constant_frontier)
- {
- assert(node.constant && "Constant frontier should also, by definition, be constant");
- assert(input.constant && "Input for constant forontier should, by definition, be constant");
- input.constant_frontier = true;
- }
-
if (!node.is_endpoint())
replace_all_usages(node, input);
else
return true;
}
-void program_impl::replace_data_with_optimized(std::map<primitive_id, memory_impl::ptr> const & replace_map)
+void program_impl::remove_nodes(std::list<program_node*>& to_remove)
{
- for (auto& result : replace_map)
+ for (auto const& node : to_remove)
{
- auto& node = *nodes_map.at(result.first);
- assert(node.is_type<data>() && "Optimized primitive is not a cldnn::data");
- assert(result.second != nullptr && "Memory which handles result of optimization should not be nullptr");
- node.as<data>().attach_memory(*result.second, false);
+ if (node->is_input())
+ get_inputs().remove(node);
+ else
+ {
+ for (auto& dep : node->dependencies)
+ dep->users.remove(node);
+ }
+ for (auto& user : node->users)
+ {
+ user->dependencies.erase(std::remove(user->dependencies.begin(),
+ user->dependencies.end(), node),
+ user->dependencies.end());
+ }
+ get_processing_order().erase(node);
+ optimized_out.push_back(node->id());
+ nodes_map.erase(node->id());
}
}
{
return;
}
-
path += "cldnn_memory_pool.log";
auto dep = get_memory_dependencies_string();
get_engine().dump_memory_pool(*this, path, dep);
- dump_program("14_memory_pool", true);
+ std::string dump_file_name = std::to_string(pm->get_pass_count()+1) + "_memory_pool";
+ dump_program(dump_file_name.c_str(), true);
}
//TODO: break this function into number of smaller ones + add per-primitive fields (possibly use primitive_inst::to_string?)
void program_impl::dump_program(const char* stage, bool with_full_info, std::function<bool(program_node const&)> const& filter) const
{
- auto path = get_dir_path(options);
+ std::string path = get_dir_path(options);
if (path.empty())
{
return;
dump_graph_optimized(graph, *this);
}
-//Dumps weights and biasses in serialization process, not working yet, in progress.
-void program_impl::dump_weights_and_biasses(std::vector<unsigned long long>& offsets, std::vector<std::string>& data_names, std::ofstream& file_stream) const
-{
- for (auto const& n : nodes_map)
- {
- auto dependency_count = (unsigned int)n.second.get()->get_dependencies().size();
- for (unsigned int dp = 0; dp < dependency_count; dp++)
- {
- auto& dependency = n.second.get()->get_dependency(dp);
- if (dependency.is_type<data>())
- {
- offsets.push_back(offsets.empty() ? 0ull : offsets.back());
- auto& mem = dependency.as<data>().get_attached_memory();
- if (mem.get_layout().data_type == data_types::f32)
- dump_data(mem, file_stream, offsets.back(), sizeof(float));
- else
- dump_data(mem, file_stream, offsets.back(), sizeof(short));
- data_names.push_back(dependency.as<data>().id());
- }
- }
- }
- file_stream.close();
-}
-
-//Makes serialization with given name.
-//Placeholder, not working yet, in progress.
-void program_impl::serialize(std::string network_name, std::function<bool(program_node const&)> const& filter) const
-{
- std::vector<unsigned long long> offsets;
- std::vector<std::string> data_names;
-
- std::ofstream file_stream(network_name + "_" + "serialization" + ".bin", std::ios::binary);
- dump_kernels(engine->get_context().get()->get_kernels_cache().get_context().get_binaries(), offsets, data_names, file_stream);
- dump_weights_and_biasses(offsets, data_names, file_stream);
- std::ofstream graph(network_name + "_" + "serialization" + ".xml");
- dump_to_xml(graph, *this, filter, offsets, data_names);
-}