inference-engine/thirdparty/clDNN/src/network.cpp

   1 /*
   2 // Copyright (c) 2016 Intel Corporation
   3 //
   4 // Licensed under the Apache License, Version 2.0 (the "License");
   5 // you may not use this file except in compliance with the License.
   6 // You may obtain a copy of the License at
   7 //
   8 //      http://www.apache.org/licenses/LICENSE-2.0
   9 //
  10 // Unless required by applicable law or agreed to in writing, software
  11 // distributed under the License is distributed on an "AS IS" BASIS,
  12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 // See the License for the specific language governing permissions and
  14 // limitations under the License.
  15 */
  16
  17 ///////////////////////////////////////////////////////////////////////////////////////////////////
#include "network_impl.h"
#include "engine_impl.h"
#include "event_impl.h"
#include "program_impl.h"
#include "api/CPP/data.hpp"
#include "api/CPP/mutable_data.hpp"
#include "api/CPP/input_layout.hpp"

#include "error_handler.h"
#include "primitive_inst.h"
#include "input_layout_inst.h"
#include "condition_inst.h"
#include "kernel_selector_helper.h"
#include <algorithm>
#include <cstring>

#include "gpu/ocl_toolkit.h"
  34
  35
  36 //#define DEBUG_DUMP_PATH "/tmp/dump/"
  37
  38
  39 #ifdef DEBUG_DUMP_PATH
  40 #include <iomanip>
  41 #include <fstream>
  42
  43 #define DUMP_VERBOSE 0
  44 #define DUMP_SINGLE_LAYER 0
  45 #define DUMP_LAYER_NAME ""
  46 #endif
  47
  48 namespace cldnn
  49 {
  50 #ifdef DEBUG_DUMP_PATH
  51 static float convert_half_to_float(half_t val, bool flush_denorm_to_zero = false)
  52     {
  53 #if defined HALF_HALF_HPP
  54         return val;
  55 #else
  56         // FP32 parts extracted from FP16.
  57         uint32_t sign = (static_cast<uint16_t>(val) & 0x8000U) << 16;
  58         uint32_t mantissa = (static_cast<uint16_t>(val) & 0x3FFU) << 13;
  59
  60         uint32_t exp_val_f16 = (static_cast<uint16_t>(val) & 0x7C00U) >> 10;
  61         uint32_t exp;
  62         if (exp_val_f16 == 0)
  63         {
  64             // Handling +/-0 and denormals.
  65             if (mantissa == 0)
  66             {
  67                 exp = 0;
  68             }
  69             else if (flush_denorm_to_zero)
  70             {
  71                 sign = 0;
  72                 exp = 0;
  73                 mantissa = 0;
  74             }
  75             else
  76             {
  77                 // Denorms conversion to normal numbers.
  78                 exp = 127 - 15;
  79                 while (!(mantissa & 0x400000U))
  80                 {
  81                     mantissa <<= 1;
  82                     --exp;
  83                 }
  84                 mantissa = (mantissa << 1) & 0x7FFFFFU;
  85                 exp <<= 23;
  86             }
  87         }
  88         else
  89         {
  90             // Handling +/-infinity, NaN and normal numbers.
  91             exp = (exp_val_f16 == 0x1FU ? 0xFFU : exp_val_f16 + 127 - 15) << 23;
  92         }
  93
  94         float ret;
  95         reinterpret_cast<uint32_t&>(ret) = sign | exp | mantissa;
  96
  97         return ret;
  98 #endif
  99     }
 100
 101     float convert_element(float f)
 102     {
 103         return f;
 104     }
 105
 106     float convert_element(half_t h)
 107     {
 108         return convert_half_to_float(h);
 109     }
 110
 111     template <class T>
 112     static void dump(memory_impl& mem, std::ofstream& file_stream)
 113     {
 114         auto&& size = mem.get_layout().size;
 115
 116         file_stream << "shape: ";
 117         file_stream << size.batch[0] << " ";
 118         file_stream << size.feature[0] << " ";
 119         file_stream << size.spatial[1] << " ";
 120         file_stream << size.spatial[0] << " ";
 121         file_stream << "(" << size.batch[0] * size.feature[0] * size.spatial[1] * size.spatial[0] << ")" << std::endl;
 122
 123         auto mem_ptr = static_cast<T*>(mem.lock());
 124
 125         for (cldnn::tensor::value_type b = 0; b < size.batch[0]; ++b)
 126         {
 127             for (cldnn::tensor::value_type f = 0; f < size.feature[0]; ++f)
 128             {
 129                 for (cldnn::tensor::value_type y = 0; y < size.spatial[1]; ++y)
 130                 {
 131                     for (cldnn::tensor::value_type x = 0; x < size.spatial[0]; ++x)
 132                     {
 133                         cldnn::tensor t(cldnn::batch(b), cldnn::feature(f), cldnn::spatial(x, y));
 134                         size_t input_it = mem.get_layout().get_linear_offset(t);
 135                         file_stream << std::fixed << std::setprecision(6) << convert_element(mem_ptr[input_it]) << std::endl;
 136                     }
 137                 }
 138             }
 139         }
 140
 141         mem.unlock();
 142     }
 143
 144     static void log_memory_to_file(memory_impl& mem, std::string layerName)
 145     {
 146         std::string filename = layerName;
 147         std::replace(filename.begin(), filename.end(), '\\', '_');
 148         std::replace(filename.begin(), filename.end(), '/', '_');
 149         std::replace(filename.begin(), filename.end(), ' ', '_');
 150         std::replace(filename.begin(), filename.end(), ':', '_');
 151         filename = DEBUG_DUMP_PATH + filename + ".txt";
 152
 153         std::ofstream file_stream(filename);
 154         if (mem.get_layout().data_type == cldnn::data_types::f32)
 155             dump<float>(mem, file_stream);
 156         else
 157             dump<half_t>(mem, file_stream);
 158     }
 159 #endif
/*
network_impl always has net_id = 0 when it is a cldnn internal micro-network (created e.g. by the propagate_constants optimization pass).
*/
 163 network_impl::network_impl(const program_impl& program, bool is_internal)
 164     : _program(&program)
 165     , _internal(is_internal)
 166 {
 167     static std::atomic<uint32_t> id_gen{ 0 };
 168     if (!_internal)
 169     {
 170         net_id = ++id_gen;
 171     }
 172
 173     allocate_primitives();
 174     check_names();
 175     build_insts_deps();
 176     build_exec_order();
 177     validate_primitives();
 178     _program->dump_memory_pool();
 179 }
 180
 181 network_impl::network_impl(engine_impl& engine, const topology_impl& topo, const build_options& options, bool is_internal)
 182     : network_impl(*engine.build_program(topo, options, is_internal), is_internal)
 183 {
 184 }
 185
 186 network_impl::network_impl(engine_impl& engine, const std::set<std::shared_ptr<program_node>>& nodes, const build_options& options, bool is_internal)
 187     : network_impl(*engine.build_program(nodes, options, is_internal), is_internal)
 188 {
 189 }
 190
 191 void network_impl::validate_primitives()
 192 {
 193     for (auto const& prim : _exec_order)
 194     {
 195         bool valid = prim->validate();
 196         CLDNN_ERROR_NOT_EQUAL(prim->id(), "validate", valid, "", true, "has not a valid instance.");
 197     }
 198 }
 199
 200 void network_impl::reset_execution(bool wait)
 201 {
 202     if (wait && _events.size() > 0)
 203     {
 204         std::vector<event_impl::ptr> events;
 205         for (auto& pair : _events)
 206         {
 207             auto& ev = pair.second;
 208             if (ev->is_set())
 209                 continue;
 210
 211             events.push_back(ev);
 212         }
 213
 214         get_engine().wait_for_events(events);
 215     }
 216     _events.clear();
 217 }
 218
 219 void network_impl::set_input_data(const primitive_id& id, memory_impl& data)
 220 {
 221     std::shared_ptr<primitive_inst> primitive_inst;
 222
 223     primitive_inst = find_primitive(id);
 224
 225     if(primitive_inst == nullptr)
 226         throw std::runtime_error("topology doesn't contain prmitive:" + id);
 227
 228     if (primitive_inst->type() != input_layout::type_id())
 229     {
 230         CLDNN_ERROR_MESSAGE(id, "primitive " + id + " is not an input");
 231     }
 232
 233     auto input = std::static_pointer_cast<input_layout_inst>(primitive_inst);
 234
 235     //Wait for previous execution completion
 236     reset_execution(true);
 237     input->set_data(data);
 238 }
 239
 240 void cldnn::network_impl::check_names()
 241 {
 242     for (auto const& prim : _primitives)
 243     {
 244         if (find_in_internal_networks(prim.first) != nullptr)
 245             CLDNN_ERROR_MESSAGE("Network_impl", "Found primitive with id: " + prim.first
 246                 + "in anotother network.");
 247     }
 248 }
 249
 250 std::shared_ptr<primitive_inst> cldnn::network_impl::find_primitive(const primitive_id& id)
 251 {
 252     std::shared_ptr<primitive_inst> ret;
 253
 254     if (_primitives.find(id) != _primitives.end())
 255         return _primitives.at(id);
 256
 257     return find_in_internal_networks(id);
 258 }
 259
 260 std::shared_ptr<primitive_inst> cldnn::network_impl::find_in_internal_networks(const primitive_id& id)
 261 {
 262     std::shared_ptr<primitive_inst> ret;
 263
 264     for (auto const& prim : _primitives)
 265     {
 266         if (prim.second->type() == condition::type_id()) //currently only condition inst contains mini networks
 267         {
 268             auto cond_inst = std::static_pointer_cast<condition_inst>(prim.second);
 269             ret = cond_inst->get_net_true()->find_primitive(id);
 270             if (ret != nullptr)
 271                 return ret;
 272             ret = cond_inst->get_net_false()->find_primitive(id);
 273             if (ret != nullptr)
 274                 return ret;
 275         }
 276     }
 277     return nullptr;
 278 }
 279
 280 void network_impl::set_learning_rate(const float lr)
 281 {
 282     _learning_rate = lr;
 283 }
 284
 285 float network_impl::get_learning_rate()
 286 {
 287     return _learning_rate;
 288 }
 289
 290 std::string network_impl::get_primitive_info(const primitive_id& id) const
 291 {
 292     const auto& node = _program->get_node(id);
 293     return node.type()->to_string(node);
 294 }
 295
 296 void network_impl::allocate_primitives()
 297 {
 298     std::vector<std::shared_ptr<program_node>> nodes_to_allocate{};
 299     for (auto node : _program->get_processing_order())
 300     {
 301         nodes_to_allocate.push_back(_program->get_node_ptr(node->id()));
 302     }
 303     std::sort(nodes_to_allocate.begin(), nodes_to_allocate.end(), [](std::shared_ptr<program_node> const& lhs,
 304                                                                      std::shared_ptr<program_node> const& rhs)
 305     {
 306         return (lhs->get_output_layout().bytes_count() > rhs->get_output_layout().bytes_count());
 307     });
 308
 309     for (auto const& node : nodes_to_allocate)
 310     {
 311         allocate_primitive_instance(*node);
 312     }
 313 }
 314
 315 void network_impl::build_insts_deps()
 316 {
 317     for (auto& inst : _primitives)
 318     {
 319         inst.second->build_deps();
 320     }
 321 }
 322
 323 void network_impl::build_exec_order()
 324 {
 325     for (auto& node : _program->get_processing_order())
 326     {
 327         if (!node->is_type<data>() &&
 328             !(node->is_type<mutable_data>() && node->get_dependencies().empty()))
 329         {
 330             add_to_exec_order(node->id());
 331         }
 332     }
 333 }
 334 void network_impl::add_to_exec_order(const primitive_id& id)
 335 {
 336     auto inst = get_primitive(id);
 337     _exec_order.push_back(inst);
 338 }
 339
 340 void network_impl::execute(const std::vector<refcounted_obj_ptr<event_impl>>& events)
 341 {
 342     //Wait for previous execution completion
 343     reset_execution(false);
 344
 345     for (auto& inst : _exec_order)
 346     {
 347 #ifdef DEBUG_DUMP_PATH
 348         auto& node = _program->get_node(inst->id());
 349
 350         std::string layer_name = node.id();
 351 #if DUMP_VERBOSE
 352         std::cerr << get_primitive_info(inst->id()) << std::endl;
 353 #endif
 354 #if DUMP_SINGLE_LAYER
 355         if (layer_name == DUMP_LAYER_NAME)
 356 #endif
 357         {
 358             std::cerr << "Dump " << layer_name << " layer" << std::endl;
 359             for (size_t i = 0; i < get_primitive(inst->id())->inputs_memory_count(); i++)
 360             {
 361                 log_memory_to_file(get_primitive(inst->id())->input_memory(i), layer_name + "_src_" + std::to_string(i));
 362             }
 363         }
 364 #endif
 365         execute_primitive(inst, events);
 366 #ifdef DEBUG_DUMP_PATH
 367 #if DUMP_SINGLE_LAYER
 368         if (layer_name == DUMP_LAYER_NAME)
 369 #endif
 370         {
 371             log_memory_to_file(get_primitive(inst->id())->output_memory(), layer_name + "_dst_0");
 372         }
 373         get_engine().flush_network();
 374 #endif
 375     }
 376
 377     for (auto& inst : _program->get_processing_order())
 378     {
 379         //Special handling for mutable data. The event should be the same as the user or dependency with highest processing_num as
 380         //the mutable_data can be updated when is both user or dependency.
 381         if (inst->is_type<mutable_data>())
 382         {
 383             decltype(_program->get_processing_order().get_processing_number(inst)) proc_num = 0;
 384             for (auto& user : inst->get_users())
 385             {
 386                 auto user_proc_num = _program->get_processing_order().get_processing_number(user);
 387                 if (user_proc_num > proc_num)
 388                 {
 389                     _events[inst->id()] = _events[user->id()];
 390                     proc_num = user_proc_num;
 391                 }
 392             }
 393
 394             if (!inst->get_dependencies().empty())
 395             {
 396                 for (auto& dep : inst->get_dependencies())
 397                 {
 398                     auto dep_proc_num = _program->get_processing_order().get_processing_number(dep);
 399                     if (dep_proc_num > proc_num)
 400                     {
 401                         _events[inst->id()] = _events[dep->id()];
 402                         proc_num = dep_proc_num;
 403                     }
 404                 }
 405             }
 406         }
 407     }
 408
 409     for (auto& dout : _data_outputs) //data primitives are not executed so if they are marked as output we need to add them valid events manually
 410     {
 411         _events[dout->id()] = get_engine().create_user_event(true);
 412     }
 413
 414     for (auto& prim : _primitives)
 415     {
 416         prim.second->reset_output_change();
 417     }
 418
 419     get_engine().get_context()->reset_events();
 420
 421     // Using output of previouse network as input to another one may cause hazard (in OOOQ mode) if user would not
 422     // provide proper event to execution. Flushing pipeline should prevent this kind of issues.
 423     // In scenarios with a big number of very small networks it can provide performance drop.
 424     get_engine().flush_network();
 425 }
 426
 427 std::vector<primitive_id> network_impl::get_output_ids() const
 428 {
 429     std::vector<primitive_id> ret;
 430     ret.reserve(_outputs.size());
 431     for (auto const& output : _outputs)
 432         ret.push_back(output->id());
 433     return ret;
 434 }
 435
 436 std::vector<primitive_id> network_impl::get_executed_primitive_ids() const
 437 {
 438     std::vector<primitive_id> ret;
 439     ret.reserve(_exec_order.size());
 440     for (auto const& executed_primitive : _exec_order)
 441     {
 442         ret.push_back(executed_primitive->id());
 443     }
 444     return ret;
 445 }
 446
 447 std::vector<primitive_id> network_impl::get_all_primitive_ids() const
 448 {
 449     std::vector<primitive_id> ret;
 450     ret.reserve(_primitives.size());
 451     for (auto const& primitive : _primitives)
 452         if(primitive.second->can_be_optimized())
 453             ret.push_back("_optimized_");
 454         else
 455             ret.push_back(primitive.second->id());
 456     return ret;
 457 }
 458
 459 std::vector<primitive_id> network_impl::get_all_primitive_org_ids() const
 460 {
 461     std::vector<primitive_id> ret;
 462     ret.reserve(_primitives.size());
 463     for (auto const& primitive : _primitives)
 464         ret.push_back(primitive.second->org_id());
 465     return ret;
 466 }
 467
 468 std::shared_ptr<primitive_inst> network_impl::get_primitive(const primitive_id& id)
 469 {
 470     if (!_primitives.count(id))
 471         allocate_primitive_instance(_program->get_node(id));
 472
 473     return _primitives.at(id);
 474 }
 475
 476 std::vector<std::shared_ptr<primitive_inst>> network_impl::get_primitives(const std::vector<primitive_id>& ids)
 477 {
 478     std::vector<std::shared_ptr<primitive_inst>> result(ids.size());
 479     std::transform(std::begin(ids), std::end(ids), std::begin(result), [&](const primitive_id& id) { return get_primitive(id); });
 480     return result;
 481 }
 482
 483 std::vector<std::shared_ptr<primitive_inst>> network_impl::get_primitives(const std::vector<program_node*>& nodes)
 484 {
 485     std::vector<std::shared_ptr<primitive_inst>> result(nodes.size());
 486     std::transform(std::begin(nodes), std::end(nodes), std::begin(result), [&](const program_node* node) { return get_primitive(node->id()); });
 487     return result;
 488 }
 489
 490 void network_impl::execute_primitive(const std::shared_ptr<primitive_inst>& primitive, const std::vector<refcounted_obj_ptr<event_impl>>& events)
 491 {
 492     auto id = primitive->id();
 493     auto it = _events.find(id);
 494     bool found = (it != _events.end());
 495     CLDNN_ERROR_BOOL(id, "Invalid primitive call ", found, "Primitive " + id + " is tried to be executed for the second time");
 496
 497     event_impl::ptr ev;
 498     if (!get_engine().get_context()->enabled_single_kernel() || get_engine().get_context()->single_kernel_name() == id)
 499         ev = primitive->execute(events);
 500     else
 501         ev = get_engine().create_user_event(true);
 502     _events.insert({ id, ev });
 503 }
 504
 505 void network_impl::allocate_primitive_instance(program_node const& node)
 506 {
 507     if (_primitives.count(node.id()))
 508         return;
 509
 510     auto inst = node.type()->create_instance(*this, node);
 511     _primitives[node.id()] = inst;
 512     if (node.is_input())
 513         _inputs.push_back(inst);
 514     if (node.is_output())
 515     {
 516         _outputs.push_back(inst);
 517         if (node.is_type<data>())
 518             _data_outputs.push_back(inst);
 519     }
 520 }
 521 }