2 // Copyright (c) 2016 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
17 ///////////////////////////////////////////////////////////////////////////////////////////////////
18 #include "network_impl.h"
19 #include "engine_impl.h"
20 #include "event_impl.h"
21 #include "program_impl.h"
22 #include "api/CPP/data.hpp"
23 #include "api/CPP/mutable_data.hpp"
24 #include "api/CPP/input_layout.hpp"
26 #include "error_handler.h"
27 #include "primitive_inst.h"
28 #include "input_layout_inst.h"
29 #include "condition_inst.h"
30 #include "kernel_selector_helper.h"
33 #include "gpu/ocl_toolkit.h"
// Build-time switch for layer-by-layer blob dumping (debug only); leave
// commented out for normal builds.
//#define DEBUG_DUMP_PATH "/tmp/dump/"
#ifdef DEBUG_DUMP_PATH
// Controls for the dump facility (only meaningful when DEBUG_DUMP_PATH is on).
#define DUMP_VERBOSE 0
// When non-zero, only the layer whose id equals DUMP_LAYER_NAME is dumped.
#define DUMP_SINGLE_LAYER 0
#define DUMP_LAYER_NAME ""
#ifdef DEBUG_DUMP_PATH
// Debug-only helper: expands an FP16 bit pattern into an FP32 value by
// widening its sign/exponent/mantissa fields manually.
// NOTE(review): several lines of this helper (branch bodies, braces, the
// declarations of 'exp' and 'ret') are not visible in this view; the
// comments below describe only the visible statements.
// @param val                  half-precision input (bits read via uint16_t cast)
// @param flush_denorm_to_zero when true, FP16 denormals are flushed to +/-0
static float convert_half_to_float(half_t val, bool flush_denorm_to_zero = false)
#if defined HALF_HALF_HPP
// FP32 parts extracted from FP16.
uint32_t sign = (static_cast<uint16_t>(val) & 0x8000U) << 16;    // sign bit moved to bit 31
uint32_t mantissa = (static_cast<uint16_t>(val) & 0x3FFU) << 13; // 10-bit mantissa widened to 23 bits
uint32_t exp_val_f16 = (static_cast<uint16_t>(val) & 0x7C00U) >> 10; // raw 5-bit FP16 exponent
// Handling +/-0 and denormals.
else if (flush_denorm_to_zero)
// Denorms conversion to normal numbers.
// Shift the mantissa left until its implicit leading 1 reaches bit 22.
while (!(mantissa & 0x400000U))
mantissa = (mantissa << 1) & 0x7FFFFFU;
// Handling +/-infinity, NaN and normal numbers.
// Rebias exponent 15 -> 127; an all-ones FP16 exponent maps to the all-ones
// FP32 exponent (inf/NaN).
exp = (exp_val_f16 == 0x1FU ? 0xFFU : exp_val_f16 + 127 - 15) << 23;
// Assemble the FP32 bit pattern in place.
reinterpret_cast<uint32_t&>(ret) = sign | exp | mantissa;
// Identity overload so dump<T>() can treat f32 buffers uniformly.
// NOTE(review): the one-line body (presumably "return f;") is not visible in
// this view.
float convert_element(float f)

// FP16 elements are widened to float before being printed.
float convert_element(half_t h)
    return convert_half_to_float(h);
// Writes one memory blob to 'file_stream' as text: a one-line shape header
// ("shape: b f y x (count)") followed by one value per line, iterating in
// b -> f -> y -> x order.
// NOTE(review): the "template <typename T>" header of this function is not
// visible in this view; T is the element type used to read the buffer.
static void dump(memory_impl& mem, std::ofstream& file_stream)
    auto&& size = mem.get_layout().size;
    file_stream << "shape: ";
    file_stream << size.batch[0] << " ";
    file_stream << size.feature[0] << " ";
    file_stream << size.spatial[1] << " ";
    file_stream << size.spatial[0] << " ";
    file_stream << "(" << size.batch[0] * size.feature[0] * size.spatial[1] * size.spatial[0] << ")" << std::endl;

    // lock() maps the buffer for host access; elements are read as T.
    auto mem_ptr = static_cast<T*>(mem.lock());

    for (cldnn::tensor::value_type b = 0; b < size.batch[0]; ++b)
        for (cldnn::tensor::value_type f = 0; f < size.feature[0]; ++f)
            for (cldnn::tensor::value_type y = 0; y < size.spatial[1]; ++y)
                for (cldnn::tensor::value_type x = 0; x < size.spatial[0]; ++x)
                    // get_linear_offset() maps the logical bfyx coordinate to
                    // the buffer's actual (possibly padded) linear index.
                    cldnn::tensor t(cldnn::batch(b), cldnn::feature(f), cldnn::spatial(x, y));
                    size_t input_it = mem.get_layout().get_linear_offset(t);
                    file_stream << std::fixed << std::setprecision(6) << convert_element(mem_ptr[input_it]) << std::endl;
// Dumps 'mem' to DEBUG_DUMP_PATH/<sanitized layerName>.txt, selecting the
// element reader from the blob's data type.
static void log_memory_to_file(memory_impl& mem, std::string layerName)
    std::string filename = layerName;
    // Replace characters that are illegal or awkward in file names.
    std::replace(filename.begin(), filename.end(), '\\', '_');
    std::replace(filename.begin(), filename.end(), '/', '_');
    std::replace(filename.begin(), filename.end(), ' ', '_');
    std::replace(filename.begin(), filename.end(), ':', '_');
    filename = DEBUG_DUMP_PATH + filename + ".txt";

    std::ofstream file_stream(filename);
    if (mem.get_layout().data_type == cldnn::data_types::f32)
        dump<float>(mem, file_stream);
    // NOTE(review): the branch keyword between these two calls is not visible
    // in this view; presumably non-f32 blobs are dumped as f16.
        dump<half_t>(mem, file_stream);
Network_impl will always have net_id = 0 when it is an internal cldnn micro-network (e.g. one created by the propagate_constants optimization pass).
// Builds a network over an already-compiled program: allocates primitive
// instances and prepares the execution state.
// NOTE(review): several constructor lines (other initializer-list entries,
// the net-id assignment, and intermediate build steps) are not visible in
// this view.
network_impl::network_impl(const program_impl& program, bool is_internal)
    , _internal(is_internal)
    // Monotonic generator for network ids; internal networks keep net_id = 0
    // (see the file-level comment above).
    static std::atomic<uint32_t> id_gen{ 0 };
    allocate_primitives();
    validate_primitives();
    _program->dump_memory_pool();
// Convenience constructor: compiles 'topo' into a program with the given
// build options, then delegates to the program-based constructor.
network_impl::network_impl(engine_impl& engine, const topology_impl& topo, const build_options& options, bool is_internal)
    : network_impl(*engine.build_program(topo, options, is_internal), is_internal)
// Convenience constructor: builds a program from an explicit set of program
// nodes, then delegates to the program-based constructor.
network_impl::network_impl(engine_impl& engine, const std::set<std::shared_ptr<program_node>>& nodes, const build_options& options, bool is_internal)
    : network_impl(*engine.build_program(nodes, options, is_internal), is_internal)
191 void network_impl::validate_primitives()
193 for (auto const& prim : _exec_order)
195 bool valid = prim->validate();
196 CLDNN_ERROR_NOT_EQUAL(prim->id(), "validate", valid, "", true, "has not a valid instance.");
// Synchronization point between executions: when 'wait' is set, blocks until
// all events recorded by the previous run have completed.
// NOTE(review): the lines between 'ev' and the push_back (likely a validity
// check on the event) and the tail of the function are not visible in this
// view.
void network_impl::reset_execution(bool wait)
    if (wait && _events.size() > 0)
        std::vector<event_impl::ptr> events;
        for (auto& pair : _events)
            auto& ev = pair.second;
            events.push_back(ev);
        // Block until every collected event is signaled.
        get_engine().wait_for_events(events);
219 void network_impl::set_input_data(const primitive_id& id, memory_impl& data)
221 std::shared_ptr<primitive_inst> primitive_inst;
223 primitive_inst = find_primitive(id);
225 if(primitive_inst == nullptr)
226 throw std::runtime_error("topology doesn't contain prmitive:" + id);
228 if (primitive_inst->type() != input_layout::type_id())
230 CLDNN_ERROR_MESSAGE(id, "primitive " + id + " is not an input");
233 auto input = std::static_pointer_cast<input_layout_inst>(primitive_inst);
235 //Wait for previous execution completion
236 reset_execution(true);
237 input->set_data(data);
240 void cldnn::network_impl::check_names()
242 for (auto const& prim : _primitives)
244 if (find_in_internal_networks(prim.first) != nullptr)
245 CLDNN_ERROR_MESSAGE("Network_impl", "Found primitive with id: " + prim.first
246 + "in anotother network.");
250 std::shared_ptr<primitive_inst> cldnn::network_impl::find_primitive(const primitive_id& id)
252 std::shared_ptr<primitive_inst> ret;
254 if (_primitives.find(id) != _primitives.end())
255 return _primitives.at(id);
257 return find_in_internal_networks(id);
// Searches for 'id' inside the sub-networks owned by this network's
// primitives.
// NOTE(review): the early-return checks after each recursive lookup and the
// function tail are not visible in this view.
std::shared_ptr<primitive_inst> cldnn::network_impl::find_in_internal_networks(const primitive_id& id)
    std::shared_ptr<primitive_inst> ret;

    for (auto const& prim : _primitives)
        if (prim.second->type() == condition::type_id()) //currently only condition inst contains mini networks
            auto cond_inst = std::static_pointer_cast<condition_inst>(prim.second);
            // Recurse into both branches of the condition.
            ret = cond_inst->get_net_true()->find_primitive(id);
            ret = cond_inst->get_net_false()->find_primitive(id);
// Stores the learning rate used by training primitives.
// NOTE(review): the function body is not visible in this view; presumably it
// assigns 'lr' to _learning_rate (cf. get_learning_rate() below) — confirm.
void network_impl::set_learning_rate(const float lr)
// @return the learning rate previously set via set_learning_rate().
float network_impl::get_learning_rate()
    return _learning_rate;
290 std::string network_impl::get_primitive_info(const primitive_id& id) const
292 const auto& node = _program->get_node(id);
293 return node.type()->to_string(node);
// Creates the runtime instance for every node of the program, allocating the
// nodes with the largest outputs first (descending bytes_count) so large
// buffers are placed before the pool fragments.
void network_impl::allocate_primitives()
    std::vector<std::shared_ptr<program_node>> nodes_to_allocate{};
    for (auto node : _program->get_processing_order())
        nodes_to_allocate.push_back(_program->get_node_ptr(node->id()));
    // Sort by descending output size in bytes.
    std::sort(nodes_to_allocate.begin(), nodes_to_allocate.end(), [](std::shared_ptr<program_node> const& lhs,
        std::shared_ptr<program_node> const& rhs)
        return (lhs->get_output_layout().bytes_count() > rhs->get_output_layout().bytes_count());
    for (auto const& node : nodes_to_allocate)
        allocate_primitive_instance(*node);
315 void network_impl::build_insts_deps()
317 for (auto& inst : _primitives)
319 inst.second->build_deps();
323 void network_impl::build_exec_order()
325 for (auto& node : _program->get_processing_order())
327 if (!node->is_type<data>() &&
328 !(node->is_type<mutable_data>() && node->get_dependencies().empty()))
330 add_to_exec_order(node->id());
334 void network_impl::add_to_exec_order(const primitive_id& id)
336 auto inst = get_primitive(id);
337 _exec_order.push_back(inst);
// Runs one pass of the network: executes every primitive in _exec_order,
// then fixes up the per-primitive completion events used for host/device
// synchronization.
// NOTE(review): many brace and #endif lines of this function are not
// visible in this view; the comments describe only the visible statements.
// @param events external events the executed primitives may wait on
void network_impl::execute(const std::vector<refcounted_obj_ptr<event_impl>>& events)
    //Wait for previous execution completion
    reset_execution(false);

    for (auto& inst : _exec_order)
#ifdef DEBUG_DUMP_PATH
        auto& node = _program->get_node(inst->id());
        std::string layer_name = node.id();
        std::cerr << get_primitive_info(inst->id()) << std::endl;
#if DUMP_SINGLE_LAYER
        if (layer_name == DUMP_LAYER_NAME)
        // Dump every input blob of the selected layer before it runs.
        std::cerr << "Dump " << layer_name << " layer" << std::endl;
        for (size_t i = 0; i < get_primitive(inst->id())->inputs_memory_count(); i++)
            log_memory_to_file(get_primitive(inst->id())->input_memory(i), layer_name + "_src_" + std::to_string(i));
        execute_primitive(inst, events);
#ifdef DEBUG_DUMP_PATH
#if DUMP_SINGLE_LAYER
        if (layer_name == DUMP_LAYER_NAME)
            // Dump the output blob right after the primitive executed.
            log_memory_to_file(get_primitive(inst->id())->output_memory(), layer_name + "_dst_0");
        // Flush so the dumped output is actually computed before reading it.
        get_engine().flush_network();

    for (auto& inst : _program->get_processing_order())
        //Special handling for mutable data. The event should be the same as the user or dependency with highest processing_num as
        //the mutable_data can be updated when is both user or dependency.
        if (inst->is_type<mutable_data>())
            decltype(_program->get_processing_order().get_processing_number(inst)) proc_num = 0;
            // Pick the event of the latest-executing user...
            for (auto& user : inst->get_users())
                auto user_proc_num = _program->get_processing_order().get_processing_number(user);
                if (user_proc_num > proc_num)
                    _events[inst->id()] = _events[user->id()];
                    proc_num = user_proc_num;

            // ...or of the latest-executing dependency, whichever is later.
            if (!inst->get_dependencies().empty())
                for (auto& dep : inst->get_dependencies())
                    auto dep_proc_num = _program->get_processing_order().get_processing_number(dep);
                    if (dep_proc_num > proc_num)
                        _events[inst->id()] = _events[dep->id()];
                        proc_num = dep_proc_num;

    for (auto& dout : _data_outputs) //data primitives are not executed so if they are marked as output we need to add them valid events manually
        _events[dout->id()] = get_engine().create_user_event(true);

    for (auto& prim : _primitives)
        prim.second->reset_output_change();

    // Drop per-iteration event bookkeeping held by the OCL context.
    get_engine().get_context()->reset_events();

    // Using output of a previous network as input to another one may cause a hazard (in OOOQ mode) if the user does not
    // provide a proper event to execution. Flushing the pipeline should prevent this kind of issue.
    // In scenarios with a big number of very small networks it can cause a performance drop.
    get_engine().flush_network();
427 std::vector<primitive_id> network_impl::get_output_ids() const
429 std::vector<primitive_id> ret;
430 ret.reserve(_outputs.size());
431 for (auto const& output : _outputs)
432 ret.push_back(output->id());
436 std::vector<primitive_id> network_impl::get_executed_primitive_ids() const
438 std::vector<primitive_id> ret;
439 ret.reserve(_exec_order.size());
440 for (auto const& executed_primitive : _exec_order)
442 ret.push_back(executed_primitive->id());
// @return one entry per primitive: the placeholder "_optimized_" for
// primitives that were optimized out, otherwise the primitive's id.
// NOTE(review): the branch keyword between the two push_back calls and the
// function tail (return) are not visible in this view.
std::vector<primitive_id> network_impl::get_all_primitive_ids() const
    std::vector<primitive_id> ret;
    ret.reserve(_primitives.size());
    for (auto const& primitive : _primitives)
        if(primitive.second->can_be_optimized())
            ret.push_back("_optimized_");
            ret.push_back(primitive.second->id());
459 std::vector<primitive_id> network_impl::get_all_primitive_org_ids() const
461 std::vector<primitive_id> ret;
462 ret.reserve(_primitives.size());
463 for (auto const& primitive : _primitives)
464 ret.push_back(primitive.second->org_id());
468 std::shared_ptr<primitive_inst> network_impl::get_primitive(const primitive_id& id)
470 if (!_primitives.count(id))
471 allocate_primitive_instance(_program->get_node(id));
473 return _primitives.at(id);
476 std::vector<std::shared_ptr<primitive_inst>> network_impl::get_primitives(const std::vector<primitive_id>& ids)
478 std::vector<std::shared_ptr<primitive_inst>> result(ids.size());
479 std::transform(std::begin(ids), std::end(ids), std::begin(result), [&](const primitive_id& id) { return get_primitive(id); });
483 std::vector<std::shared_ptr<primitive_inst>> network_impl::get_primitives(const std::vector<program_node*>& nodes)
485 std::vector<std::shared_ptr<primitive_inst>> result(nodes.size());
486 std::transform(std::begin(nodes), std::end(nodes), std::begin(result), [&](const program_node* node) { return get_primitive(node->id()); });
// Executes a single primitive and records its completion event in _events.
// NOTE(review): the declaration of 'ev' and the 'else' keyword between the
// two assignments are on lines not visible in this view.
// @param primitive instance to execute
// @param events    events the primitive may need to wait on
void network_impl::execute_primitive(const std::shared_ptr<primitive_inst>& primitive, const std::vector<refcounted_obj_ptr<event_impl>>& events)
    auto id = primitive->id();
    auto it = _events.find(id);
    bool found = (it != _events.end());
    // A primitive must not run twice within one network execution.
    CLDNN_ERROR_BOOL(id, "Invalid primitive call ", found, "Primitive " + id + " is tried to be executed for the second time");

    // Honor the "single kernel" debug option: only the selected primitive
    // actually executes; everything else gets an already-signaled user event.
    if (!get_engine().get_context()->enabled_single_kernel() || get_engine().get_context()->single_kernel_name() == id)
        ev = primitive->execute(events);
        ev = get_engine().create_user_event(true);
    _events.insert({ id, ev });
// Creates (at most once) the runtime instance for 'node' and registers it in
// the network's input/output/data-output bookkeeping lists.
// NOTE(review): the early-exit after the count() check, the condition
// guarding the _inputs registration, and the function tail run on lines not
// visible in this view.
void network_impl::allocate_primitive_instance(program_node const& node)
    if (_primitives.count(node.id()))

    auto inst = node.type()->create_instance(*this, node);
    _primitives[node.id()] = inst;
    _inputs.push_back(inst);
    if (node.is_output())
        _outputs.push_back(inst);
        // Data nodes are never executed, so outputs backed by data nodes get
        // their completion events created separately (see execute()).
        if (node.is_type<data>())
            _data_outputs.push_back(inst);