inference-engine/src/gna_plugin/gna_plugin.cpp (dldt, 2019 R1)
// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#define NOMINMAX
#include "cpp_interfaces/base/ie_plugin_base.hpp"
#include "gna_plugin.hpp"
#include "ie_plugin_config.hpp"
#include "debug.h"
#include "blob_factory.hpp"
#include "gna_plugin_log.hpp"
#include "gna_layer_info.hpp"
#include <utility>
#include <limits>
#include "ie_memcpy.h"

#ifdef PLOT
void ExportGnaNetworkAndrzej(const char *ptr_name, intel_nnet_type_t* pNeuralNetwork);
#endif

#include <stdlib.h>
#include <stdio.h>
#include <iostream>
#include <iomanip>  // std::setw used in PLOT tracing
#include <fstream>
#include <stdexcept>
#include <vector>
#include <malloc.h>
#include <math.h>
#include <string.h>
#include <list>
#include <algorithm>
#include <map>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <memory>
#include <dnn_memory.hpp>
#include <ie_layers.h>
#include "details/caseless.hpp"
#include <gna-api-types-xnn.h>
#include "gna-api.h"
#include "gna-api-dumper.h"
#include "dnn.h"
#include "pwl.h"
#include "util.h"
#include "quantization/quantization.h"
#include "lstm.hpp"
#include "graph_tools.hpp"
#include "gna_plugin_config.hpp"
#include "gna/gna_config.hpp"
#include "quantization/model_quantizer.hpp"
#include "gna_model_serial.hpp"
#include "gna_memory_state.hpp"
#include "details/ie_cnn_network_tools.h"

using namespace InferenceEngine;
using namespace std;
using namespace GNAPluginNS;
using namespace InferenceEngine::details;

#ifdef VERBOSE
#define VERBOSE_LEVEL (1)
#else
#define VERBOSE_LEVEL (0)
#endif

#ifdef PLOT
#define PLOT_LEVEL (1)
#else
#define PLOT_LEVEL (0)
#endif

#define PAGE_SIZE_BYTES 4096

#define FROM_IR_DIM(mem, idx)\
((mem->dims.size() > idx - 1) ? mem->dims[idx - 1] : 1)

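// Scalar float -> int16 conversion: rounds half away from zero and saturates
// to the int16 range, e.g. ConvertFloatToInt16(1.6f) == 2, (-1.6f) == -2,
// and anything above 32767.0f clamps to 32767.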
inline int16_t GNAPluginNS::ConvertFloatToInt16(float src) {
        float rounding_value = (src > 0) ? 0.5f : -0.5f;
        float value = src + rounding_value;
        if (value > 32767.0) {
            return 32767;
        } else if (value < -32768.0) {
            return -32768;
        }
        return (int16_t)value;
}

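// Bulk quantization helpers: ConvertToInt16 multiplies a float buffer by
// scale_factor and saturates each element to int16; ConvertToFloat performs
// the rough inverse, dividing 32-bit integer scores by the scale factor row by row.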
void GNAPluginNS::ConvertToInt16(int16_t *ptr_dst,
                    const float *ptr_src,
                    const uint32_t num_rows,
                    const uint32_t num_columns,
                    const float scale_factor) {
    if (!ptr_dst || !ptr_src) {
        return;
    }
    for (uint32_t i = 0; i < num_rows * num_columns; i++) {
        ptr_dst[i] = GNAPluginNS::ConvertFloatToInt16(ptr_src[i] * scale_factor);
    }
}

void GNAPluginNS::ConvertToFloat(float *ptr_dst,
                    int32_t *ptr_src,
                    const uint32_t num_rows,
                    const uint32_t num_columns,
                    const float scale_factor) {
    if (!ptr_dst || !ptr_src) {
        return;
    }
    for (uint32_t i = 0; i < num_rows; i++) {
        int32_t *ptr_int_row = ptr_src + i * num_columns;
        float *ptr_float_row = ptr_dst + i * num_columns;
        for (uint32_t j = 0; j < num_columns; j++) {
            ptr_float_row[j] = static_cast<float>(ptr_int_row[j]) / scale_factor;
        }
    }
}

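// Copies (and, when T != U, quantizes via ConvertFloatToInt16) user input into
// the GNA input buffer. In interleaved orientation frames are laid out
// column-wise, so element j of frame i lands at dst[j * num_group + i]; in
// non-interleaved orientation each frame is a contiguous row of
// num_vector_stride elements. Stride tails and missing frames up to num_group
// are zero-filled.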
template <typename T, typename U>
void GNAPlugin::copyInputData(T *dst,
                const U *src,
                uint32_t num_frames,
                uint32_t num_group,
                uint32_t num_vector_elements,
                uint32_t num_vector_stride,
                intel_dnn_orientation_t orientation) {
    if (!dst || !src) {
        return;
    }
    if (orientation == kDnnInterleavedOrientation) {
        for (uint32_t i = 0; i < num_frames; i++) {
            for (uint32_t j = 0; j < num_vector_elements; j++) {
                if (!std::is_same<T, U>::value) {
                    dst[j * num_group + i] = GNAPluginNS::ConvertFloatToInt16(src[i * num_vector_elements + j] * get_input_scale_factor());
                } else {
                    dst[j * num_group + i] = src[i * num_vector_elements + j];
                }
            }
            // pad to meet the weight matrix row length requirement
            for (uint32_t j = num_vector_elements; j < num_vector_stride; j++) {
                dst[j * num_group + i] = 0;
            }
        }
        // pad the partial group
        for (uint32_t i = num_frames; i < num_group; i++) {
            for (uint32_t j = 0; j < num_vector_stride; j++) {
                dst[j * num_group + i] = 0;
            }
        }
    } else {
        if (!std::is_same<T, U>::value) {
            for (uint32_t i = 0; i < num_frames; i++) {
                T *ptr_dst_vec = const_cast<T *>(reinterpret_cast<const T *>(dst) + i * num_vector_stride);
                U *ptr_src_vec = const_cast<U *>(reinterpret_cast<const U *>(src) + i * num_vector_elements);
                std::memset(ptr_dst_vec, 0, num_vector_stride * sizeof(T));
                for (uint32_t j = 0; j < num_vector_elements; j++) {
                    ptr_dst_vec[j] = GNAPluginNS::ConvertFloatToInt16(ptr_src_vec[j] * get_input_scale_factor());
                }
            }
        } else {
            for (uint32_t i = 0; i < num_frames; i++) {
                void *ptr_dst_vec = const_cast<uint8_t *>(reinterpret_cast<const uint8_t *>(dst) + i * num_vector_stride * sizeof(T));
                void *ptr_src_vec = const_cast<uint8_t *>(reinterpret_cast<const uint8_t *>(src) + i * num_vector_elements * sizeof(U));
                std::memset(ptr_dst_vec, 0, num_vector_stride * sizeof(T));
                std::memcpy(ptr_dst_vec, ptr_src_vec, num_vector_elements * sizeof(T));
            }
        }

        for (uint32_t i = num_frames; i < num_group; i++) {
            void *ptr_dst_vec = const_cast<uint8_t *>(reinterpret_cast<const uint8_t *>(dst) + i * num_vector_stride * sizeof(T));
            std::memset(ptr_dst_vec, 0, num_vector_stride * sizeof(T));
        }
    }
}

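// Scatters network input across the outputs of a split/slice layer that is
// connected directly to Input, quantizing on the fly when T != U and zero
// padding each chunk up to its 64-byte-aligned size.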
template <typename T, typename U>
void GNAPlugin::copyInputDataWithSplit(T *const dst,
                const U *src,
                const GNASplitLayer& splitInfo,
                size_t precision_size) {
    if (!dst || !src) {
        return;
    }
    T *dst_ptr = dst;
    const U *src_ptr = src;
    precision_size = sizeof(T);
    // we found a split/slice layer connected to Input
    for (auto&& outputLayer : splitInfo.splitOutputLayers) {
        uint32_t begin = outputLayer.offset/precision_size;
        uint32_t end = (outputLayer.offset + outputLayer.pure_size)/precision_size;
        if (dst_ptr - dst >= end) {
            // this output layer shares a bound pointer with the previous one; skip it
            continue;
        }
        for (uint32_t i = begin; i < end; ++i) {
            if (!std::is_same<T, U>::value) {
                *(dst_ptr++) = GNAPluginNS::ConvertFloatToInt16(*(src_ptr++) * get_input_scale_factor());
            } else {
                *(dst_ptr++) = *(src_ptr++);
            }
        }
        begin = end;
        end = (outputLayer.offset + ALIGN64(outputLayer.pure_size))/precision_size;
        std::memset(dst_ptr, 0, (end - begin) * sizeof(uint16_t));
        dst_ptr += end - begin;
    }
}

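// Copies scores from the (possibly padded, possibly interleaved) GNA output
// buffer into the user blob: de-interleaves when needed, widens 16-bit
// intermediate scores to 32 bits, and zeroes the inactive tail of each frame.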
void GNAPlugin::ExportScores(void *ptr_dst,
                  void *ptr_src,
                  intel_dnn_orientation_t orientation,
                  uint32_t num_frames,
                  uint32_t num_group,
                  uint32_t num_vector_elements,
                  uint32_t num_active_elements,
                  uint32_t num_vector_stride,
                  uint32_t num_bytes_per_element_input,
                  uint32_t num_bytes_per_element) {
    // source scores are possibly padded to a multiple of 8 and possibly interleaved
    // rotate if necessary and only copy actual scores (not padding)
    if (orientation == kDnnInterleavedOrientation) {
        if (num_bytes_per_element == 2) {
            int16_t *dst = reinterpret_cast<int16_t *>(ptr_dst);
            int16_t *src = reinterpret_cast<int16_t *>(ptr_src);
            for (uint32_t i = 0; i < num_frames; i++) {
                for (uint32_t j = 0; j < num_active_elements; j++) {
                    dst[i * num_vector_elements + j] = src[j * num_group + i];
                }
                for (uint32_t j = num_active_elements; j < num_vector_elements; j++) {
                    dst[i * num_vector_elements + j] = 0;
                }
            }
        } else if (num_bytes_per_element == 4) {  // should work for both int and float
            int32_t *dst = reinterpret_cast<int32_t *>(ptr_dst);
            int8_t *src = reinterpret_cast<int8_t*>(ptr_src);
            for (uint32_t i = 0; i < num_frames; i++) {
                for (uint32_t j = 0; j < num_active_elements; j++) {
                    auto input_ptr = src + (j * num_group + i) * num_bytes_per_element_input;
                    auto dst_ptr = dst + (i * num_vector_elements + j);

                    switch (num_bytes_per_element_input) {
                        case 2 : {
                            *dst_ptr = static_cast<int32_t>(*reinterpret_cast<int16_t*>(input_ptr));
                            break;
                        }
                        case 4 : {
                            *dst_ptr = *reinterpret_cast<int32_t*>(input_ptr);
                            break;
                        }
                        default:
                            THROW_GNA_EXCEPTION << "Unsupported output layer precision: " << num_bytes_per_element_input << " bytes";
                    }
                }
                for (uint32_t j = num_active_elements; j < num_vector_elements; j++) {
                    dst[i * num_vector_elements + j] = 0;
                }
            }
        } else {
            THROW_GNA_EXCEPTION << "Unsupported target precision for infer: " << num_bytes_per_element << " bytes";
        }
    } else {
        if (num_bytes_per_element == 2) {
            for (uint32_t i = 0; i < num_frames; i++) {
                void *ptr_dst_vec = reinterpret_cast<void *> (reinterpret_cast<uint8_t *>(ptr_dst) + i * num_vector_elements * sizeof(int16_t));
                void *ptr_src_vec = reinterpret_cast<void *> (reinterpret_cast<uint8_t *>(ptr_src) + i * num_vector_stride * sizeof(int16_t));
                memset(ptr_dst_vec, 0, num_vector_elements * sizeof(int16_t));
                memcpy(ptr_dst_vec, ptr_src_vec, num_active_elements * sizeof(int16_t));
            }
        } else if (num_bytes_per_element == 4) {  // should work for both int and float
            for (uint32_t i = 0; i < num_frames; i++) {
                void *ptr_dst_vec = reinterpret_cast<void *> (reinterpret_cast<uint8_t *>(ptr_dst) + i * num_vector_elements * sizeof(float));
                void *ptr_src_vec = reinterpret_cast<void *> (reinterpret_cast<uint8_t *>(ptr_src) + i * num_vector_stride * sizeof(float));
                memset(ptr_dst_vec, 0, num_vector_elements * sizeof(float));
                memcpy(ptr_dst_vec, ptr_src_vec, num_active_elements * sizeof(float));
            }
        } else {
            THROW_GNA_EXCEPTION << "Unsupported target precision for infer: " << num_bytes_per_element << " bytes";
        }
    }
}

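// Converts user input frames into the device representation via copyInputData:
// float input is quantized to int16 when running on the GNA device and copied
// as-is in software (float) mode; U8 and 16-bit inputs are forwarded directly.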
void GNAPlugin::ImportFrames(
                  void *ptr_dst,
                  const void *ptr_src,
                  Precision input_precision,
                  intel_dnn_orientation_t orientation,
                  uint32_t num_frames,
                  uint32_t num_group,
                  uint32_t num_vector_elements,
                  uint32_t num_vector_stride) {
    if (orientation == kDnnInterleavedOrientation) {
        // TODO: fix that as well
        if (input_precision == Precision::U8) {
            int16_t *dst = const_cast<int16_t *>(reinterpret_cast<const int16_t *>(ptr_dst));
            uint8_t *src = const_cast<uint8_t *>(reinterpret_cast<const uint8_t *>(ptr_src));
            copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
        } else if (input_precision.size() == 2) {
            int16_t *dst = const_cast<int16_t *>(reinterpret_cast<const int16_t *>(ptr_dst));
            int16_t *src = const_cast<int16_t *>(reinterpret_cast<const int16_t *>(ptr_src));
            copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
        } else if (input_precision.size() == 4) {
            if (!gnadevice) {
                float *dst = const_cast<float *>(reinterpret_cast<const float *>(ptr_dst));
                float *src = const_cast<float *>(reinterpret_cast<const float *>(ptr_src));
                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
            } else {
                int16_t *dst = reinterpret_cast<int16_t *>(ptr_dst);
                const float *src = reinterpret_cast<const float *>(ptr_src);
                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
            }
        }
    } else {
        if (input_precision == Precision::U8) {
            uint8_t *src = const_cast<uint8_t *>(reinterpret_cast<const uint8_t *>(ptr_src));
            if (!gnadevice) {
                float *dst = const_cast<float *>(reinterpret_cast<const float *>(ptr_dst));
                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
            } else {
                int16_t *dst = const_cast<int16_t *>(reinterpret_cast<const int16_t *>(ptr_dst));
                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
            }
        } else if (input_precision.size() == 2) {
            int16_t *dst = const_cast<int16_t *>(reinterpret_cast<const int16_t *>(ptr_dst));
            int16_t *src = const_cast<int16_t *>(reinterpret_cast<const int16_t *>(ptr_src));
            copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
        } else if (input_precision.size() == 4) {
            if (!gnadevice) {
                float *dst = const_cast<float *>(reinterpret_cast<const float *>(ptr_dst));
                float *src = const_cast<float *>(reinterpret_cast<const float *>(ptr_src));
                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
            } else {
                uint16_t *dst = const_cast<uint16_t *>(reinterpret_cast<const uint16_t *>(ptr_dst));
                float *src = const_cast<float *>(reinterpret_cast<const float *>(ptr_src));
                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
            }
        }
    }
}

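// Registers Memory (state) layer pairs collected under the same id as
// GNAMemoryLayer entries in memory_connection; each pair presumably links the
// layer that writes the state with the layer that reads it back.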
void GNAPlugin::fillMemoryConnections(std::unordered_map<std::string,
                                            std::vector<InferenceEngine::CNNLayerPtr>>& memoryPairs) {
    for (auto &memory : memoryPairs) {
        auto inputLayer = memory.second[1];
        auto outputLayer = memory.second[0];

        IE_ASSERT(1 == outputLayer->insData.size());

        // create a connection for the layer's output in the form of an extra map
        memory_connection.emplace_back(memory.first, GNAMemoryLayer(inputLayer, outputLayer));
    }
}

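// Records, for every input of a concat layer, the producer layer's name and
// its byte offset inside the concatenated buffer, and stores the total
// reserved size in concat_connection.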
void GNAPlugin::fillConcatConnections(InferenceEngine::CNNLayerPtr layer) {
    // create a connection record for each of the concat layer's inputs in the form of an extra map
    GNAPlugin::GNAConcatLayer layerInfoItem(layer);
    size_t concat_size = 0;
    std::string& id = layer->name;

    for (size_t i = 0; i < layer->insData.size(); ++i) {
        auto dataInput = layer->insData[i].lock();
        if (!dataInput) {
            THROW_GNA_EXCEPTION << "Input layer pointer for concat is unexpectedly absent";
        }

        auto ptrConcatLayerInput = dataInput->creatorLayer.lock();
        if (!ptrConcatLayerInput) {
            THROW_GNA_EXCEPTION << "Input layer for concat is unexpectedly absent";
        }
        layerInfoItem.concatInputLayers.emplace_back(
                GNAPlugin::GNAConcatLayer::ConcatConnectedLayerInfo({ptrConcatLayerInput->name, concat_size}));

        size_t layer_size =
                     InferenceEngine::details::product(begin(dataInput->dims),
                                                      end(dataInput->dims)) * dataInput->precision.size();
        concat_size += layer_size;
    }
    layerInfoItem.reserved_size = concat_size;
    concat_connection.emplace(id, layerInfoItem);
}

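// Records, for every consumer of a split/slice output, its name and byte
// offset inside the input buffer (AffineFilter consumers carry their own
// 64-byte-aligned "offset" parameter) and the total reserved size, padding
// included, in split_connection.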
void GNAPlugin::fillSplitConnections(InferenceEngine::CNNLayerPtr layer) {
    // create a connection record for each of the split layer's outputs in the form of an extra map
    GNAPlugin::GNASplitLayer layerInfoItem(layer);
    size_t split_size = 0;
    std::string& id = layer->name;
    auto dataInput = layer->insData.begin()->lock();
    if (!dataInput) {
        THROW_GNA_EXCEPTION << "Input layer pointer for split/slice is unexpectedly absent";
    }
    auto ptrSplitLayerInput = dataInput->creatorLayer.lock();
    if (!ptrSplitLayerInput) {
        THROW_GNA_EXCEPTION << "Input layer for split/slice is unexpectedly absent";
    }

    LayerInfo ptrSplitLayerInputLayerInfo(ptrSplitLayerInput);
    for (size_t i = 0; i < layer->outData.size(); ++i) {
        size_t padding = 0;
        size_t output_layer_size = 0;
        auto& dataOutput = layer->outData[i];

        if (!dataOutput || !dataInput) {
            THROW_GNA_EXCEPTION << "Output layer pointer for split/slice is unexpectedly absent";
        }

        for (auto&& ptrSplitLayerOutputPair : dataOutput->getInputTo()) {
            auto& ptrSplitLayerOutput = ptrSplitLayerOutputPair.second;
            if (!ptrSplitLayerOutput) {
                THROW_GNA_EXCEPTION << "Output layer for split/slice is unexpectedly absent";
            }

            padding = std::max(padding, LayerInfo(ptrSplitLayerOutput).paddingSize())
                                                        * dataOutput->precision.size();
            output_layer_size =
                    InferenceEngine::details::product(begin(dataOutput->dims),
                                                     end(dataOutput->dims)) * dataOutput->precision.size();

            if (ptrSplitLayerOutput->type == "AffineFilter") {
                size_t aligned64_offset = ptrSplitLayerOutput->GetParamAsInt("offset");
                layerInfoItem.splitOutputLayers.emplace_back(ptrSplitLayerOutput->name, aligned64_offset, output_layer_size);
            } else {
                layerInfoItem.splitOutputLayers.emplace_back(ptrSplitLayerOutput->name, split_size, output_layer_size);
            }
        }

        split_size += padding + output_layer_size;
    }
    layerInfoItem.reserved_size = split_size;
    layerInfoItem.splitInputLayer =
                    GNAPlugin::GNASplitLayer::SplitConnectedLayerInfo({ptrSplitLayerInput->type, 0,
                                                                    InferenceEngine::details::product(begin(dataInput->dims),
                                                                    end(dataInput->dims)) * dataInput->precision.size()});
    split_connection.emplace(id, layerInfoItem);
}

void GNAPlugin::DiagonalPrimitive(InferenceEngine::CNNLayerPtr layer) {
    AffinePrimitive(layer, true);
}

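// Maps an IR Convolution onto a GNA 1D convolution component: the feature map
// is flattened, rows/columns are recomputed from the x-stride, kernels are
// transposed per output channel, and filter rows are padded to a multiple of
// 8 elements.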
void GNAPlugin::ConvolutionPrimitive(InferenceEngine::CNNLayerPtr layer) {
    auto &convolution = dynamic_cast<ConvolutionLayer &>(*layer.get());
    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);

    auto inputs = layer->insData.begin()->lock();
    auto outputs = *layer->outData.begin();

    uint32_t num_feature_map_rows = FROM_IR_DIM(inputs, 1) / convolution._stride_x;
    uint32_t num_feature_map_columns = FROM_IR_DIM(inputs, 3) * convolution._stride_x / num_feature_maps;

    uint32_t num_rows_in = FROM_IR_DIM(inputs, 1);
    uint32_t num_columns_in = FROM_IR_DIM(inputs, 3);
    uint32_t num_rows_out = FROM_IR_DIM(outputs, 1);
    uint32_t num_padding = ALIGN(convolution._kernel_x * num_feature_map_columns * num_feature_maps, 8)
                                            - convolution._kernel_x * num_feature_map_columns * num_feature_maps;
    void *ptr_inputs;
    void *ptr_outputs;
    void *ptr_weights;
    void *ptr_biases;

    // TODO: it is questionable why we invent a precision for biases that are not in the IR
    auto biasPrecision = convolution._biases ? convolution._biases->precision() : outputs->precision;

    dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t());
    auto &currentComponent = dnnComponentsForLayer.back().second;

#ifdef PLOT
    cout << "IR layer : " << std::left << std::setw(20) << layer->name << dnnComponentsForLayer.size() - 1 << "\n";
#endif
    auto num_input_padding = ALIGN(num_feature_maps * num_feature_map_columns * num_feature_map_rows, 8)
                                                        - num_feature_maps * num_feature_map_columns * num_feature_map_rows;
    auto num_filter_rows = convolution._kernel_x / convolution._stride_x;
    dnn.InitConvolutional1DComponent(currentComponent,
                            1,
                            num_feature_maps * num_feature_map_columns * num_feature_map_rows + num_input_padding,
                            1,
                            num_rows_out * convolution._out_depth,
                            inputs->precision.size(),
                            outputs->precision.size(),
                            convolution._weights->precision().size(),
                            biasPrecision.size(),
                            convolution._out_depth,
                            num_filter_rows,
                            num_feature_maps * num_feature_map_columns * num_filter_rows + num_padding,

                            num_feature_maps,  // interesting: why is it done this way in gna_example?
                            num_feature_map_rows,
                            num_feature_map_columns,

                            quantized == nullptr ? 1 : quantized->_weights_quant.scale,
                            quantized == nullptr ? 1 : quantized->_dst_quant.scale,
                            ptr_inputs,
                            ptr_outputs,
                            ptr_weights,
                            ptr_biases);

    // update num_feature_maps for the next convolutional layer
    num_feature_maps = convolution._out_depth;  // = number of filters

    size_t num_data_bytes_out =
                        InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims))
                                                                                * outputs->precision.size();

    size_t num_data_bytes_in = num_columns_in * (num_rows_in + num_padding) * inputs->precision.size();

    auto connectedInputLayer = connectInput(layer, ptr_inputs, num_data_bytes_in).input;

    // TODO: the convolution might not be the first layer in sorted order but be connected
    // via a split, for example; don't know how Kaldi will handle that
    if (LayerInfo(connectedInputLayer).isInput()) {
        // Kaldi features use the opposite orientation
        dnn.num_rotate_rows = num_feature_map_columns;
        dnn.num_rotate_columns = num_feature_map_rows;
    }

    connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out);

    // rotate
    auto TransposeMatrix = [](uint8_t *ptr_matrix, size_t element_size, uint32_t num_rows, uint32_t num_cols) {
        std::vector<uint8_t> temp_buffer(num_rows * num_cols * element_size);
        for (uint32_t i = 0; i < num_rows; i++) {
            for (uint32_t j = 0; j < num_cols; j++) {
                ie_memcpy(&temp_buffer.front() + (j * num_rows + i) * element_size,
                          temp_buffer.size() - (i * num_cols + j) * element_size,
                          ptr_matrix + (i * num_cols + j) * element_size,
                          element_size);
            }
        }
        return temp_buffer;
    };

    std::vector<uint8_t> transposedWeights;
    for (uint32_t k = 0; k < convolution._out_depth; k++) {
        uint8_t *ptr_filt_current
            = convolution._weights->cbuffer().as<uint8_t *>() + k * num_columns_in * convolution._kernel[X_AXIS] * convolution.precision.size();
        auto transposedPart = TransposeMatrix(ptr_filt_current, convolution.precision.size(), num_columns_in, convolution._kernel[X_AXIS]);
        transposedWeights.insert(transposedWeights.end(), transposedPart.begin(), transposedPart.end());
    }

    if (num_padding == 0) {
        gnamem->readonly().push_local_ptr(ptr_weights, transposedWeights.data(), convolution._weights->byteSize(), 64);
    } else {
        auto elementsIn = convolution._kernel_x * num_feature_map_columns + num_padding;
        auto paddedWeights = elementsIn * convolution._out_depth;
        auto paddedWeightsSize = paddedWeights * convolution.precision.size();
        auto elements_in_row = convolution._kernel_x * num_feature_map_columns;
        gnamem->readonly().push_initializer(ptr_weights, paddedWeightsSize, [=](void * data, size_t size) {
            for (uint32_t i = 0; i < convolution._out_depth; i++) {
                memcpy(data,
                       transposedWeights.data() + elements_in_row * i * convolution.precision.size(),
                       elements_in_row * convolution.precision.size());

                data = reinterpret_cast<uint8_t *>(data) + elementsIn * convolution.precision.size();
            }
        }, 64);
    }

    if (convolution._biases) {
        gnamem->readonly().push_ptr(ptr_biases,
                                    convolution._biases->cbuffer().as<const void *>(),
                                    convolution._biases->byteSize(),
                                    64);
    } else {
        gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64);
    }
}

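// Maps an IR Power layer onto a diagonal affine component. Only power == 1 is
// supported: scale becomes the diagonal weight (quantized when needed), while
// the biases are zero-initialized in both branches below, so a non-zero offset
// is currently not applied.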
void GNAPlugin::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) {
    auto &power = dynamic_cast<PowerLayer &>(*layer.get());
    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);

    if (power.power != 1.0) {
        THROW_IE_EXCEPTION << "[GNA plugin] unsupported power factor, expected 1 but was " << power.power;
    }

    auto input = layer->insData[0].lock();

    auto outputs = *layer->outData.begin();

    uint32_t num_rows_in = FROM_IR_DIM(input, 1);
    uint32_t num_columns_in = FROM_IR_DIM(input, 2);
    uint32_t num_rows_out = num_rows_in;

    void *ptr_inputs;
    void *ptr_outputs;
    void *ptr_weights;
    void *ptr_biases;

    dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t());
    auto &currentComponent = dnnComponentsForLayer.back().second;
    dnn.InitAffineComponent(currentComponent,
                            num_rows_in,
                            num_columns_in,
                            num_rows_out,
                            input->precision.size(),
                            outputs->precision.size(),
                            // TODO: only fp32 and Int16 tested
                            quantized == nullptr ? input->precision.size() : 2,
                            quantized == nullptr ? input->precision.size() : 4,
                            quantized == nullptr ? 1 : quantized->_weights_quant.scale,
                            quantized == nullptr ? 1 : quantized->_dst_quant.scale,
                            ptr_inputs,
                            ptr_outputs,
                            ptr_weights,
                            ptr_biases,
                            true);

#ifdef PLOT
    cout << "IR layer : " << std::left << std::setw(20) << layer->name << "diagonal_" << dnnComponentsForLayer.size() - 1 << "\n";
#endif

    size_t num_data_bytes_out = InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims))
        * outputs->precision.size();

    size_t num_data_bytes_in = InferenceEngine::details::product(begin(input->dims), end(input->dims))
        * input->precision.size();

    connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out);
    connectInput(layer, ptr_inputs, num_data_bytes_in, 0, 0);

    if (power.scale != 1.0f) {
        if (quantized == nullptr) {
            gnamem->readonly().push_value(ptr_weights, power.scale, num_rows_out, 64);
        } else {
            auto scaledIdentity = quantized->_weights_quant.scale * power.scale;

            #define FLOAT_TO_INT16(a) static_cast<int16_t>(((a) < 0)?((a) - 0.5):((a) + 0.5))

            auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast<float>(INT16_MAX)));
            gnamem->readonly().push_value<int16_t>(ptr_weights, quantizedIdentity, num_rows_out, 64);
        }
    }

    if (power.offset != 0.0f) {
        if (quantized == nullptr) {
            gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64);
        } else {
            gnamem->readonly().push_value<int32_t>(ptr_biases, 0, num_rows_out, 64);
        }
    } else {
        gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64);
    }
}

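// Maps an IR Pooling layer onto a GNA max-pooling component. Only MAX pooling
// is supported; AVG (which would lose precision) and other types are rejected.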
void GNAPlugin::PoolingPrimitive(InferenceEngine::CNNLayerPtr layer) {
    auto &pooling = dynamic_cast<PoolingLayer &>(*layer.get());
    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);

    auto inputs = layer->insData.begin()->lock();
    auto outputs = *layer->outData.begin();

    uint32_t num_rows_in = FROM_IR_DIM(inputs, 1);
    uint32_t num_columns_in = FROM_IR_DIM(inputs, 3);
    uint32_t num_rows_out = FROM_IR_DIM(outputs, 1);
    uint32_t num_columns_out = FROM_IR_DIM(outputs, 3);
    uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in;

    void *ptr_inputs;
    void *ptr_outputs;

    dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t());
    auto &currentComponent = dnnComponentsForLayer.back().second;

#ifdef PLOT
    cout << "IR layer : " << std::left << std::setw(20) << layer->name << dnnComponentsForLayer.size() - 1 << "\n";
#endif
    switch (pooling._type) {
        case PoolingLayer::MAX: break;
        // we are losing precision here
        case PoolingLayer::AVG:
        default:
            // TODO: convert to SUM pooling
            THROW_GNA_EXCEPTION << "Layer: " << layer->name << " not supported";
    }

    dnn.InitMaxpoolComponent(currentComponent,
                            1,
                            num_columns_in * num_rows_in,
                            1,
                            num_columns_out * num_rows_out,
                            inputs->precision.size(),
                            outputs->precision.size(),
                            pooling._kernel[X_AXIS],
                            pooling._kernel[X_AXIS],
                            num_columns_in,
                            false,
                            quantized == nullptr ? 1 : quantized->_dst_quant.scale,
                            ptr_inputs,
                            ptr_outputs);

    size_t num_data_bytes_out = InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims))
        * outputs->precision.size();

    size_t num_data_bytes_in = num_columns_in * (num_rows_in + num_padding) * inputs->precision.size();

    connectInput(layer, ptr_inputs, num_data_bytes_in);
    connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out);
}

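// Maps an IR Copy layer onto a GNA copy component; row counts are padded to a
// multiple of 8, and the orientation apparently depends on whether a
// convolution output has been seen earlier (num_cnn_rows_out > 0).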
void GNAPlugin::CopyPrimitive(InferenceEngine::CNNLayerPtr layer) {
    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);

    auto inputs = layer->insData.begin()->lock();
    auto outputs = *layer->outData.begin();

    uint32_t num_rows_in = FROM_IR_DIM(inputs, 1);
    uint32_t num_columns_in = FROM_IR_DIM(inputs, 2);
    uint32_t num_rows_out = FROM_IR_DIM(outputs, 1);
    uint32_t num_columns_out = FROM_IR_DIM(outputs, 2);
    uint32_t num_padding_in = ALIGN(num_rows_in, 8) - num_rows_in;
    uint32_t num_padding_out = ALIGN(num_rows_out, 8) - num_rows_out;
    void *ptr_inputs;
    void *ptr_outputs;
    auto orientation = (num_cnn_rows_out > 0) ? kDnnNonInterleavedOrientation : kDnnInterleavedOrientation;

    dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t());
    auto &currentComponent = dnnComponentsForLayer.back().second;
    dnn.InitCopyComponent(currentComponent,
                          orientation,
                          ALIGN(num_rows_in, 8),
                          num_columns_in,
                          ALIGN(num_rows_out, 8),
                          num_columns_out,
                          inputs->precision.size(),
                          outputs->precision.size(),
                          quantized == nullptr ? 1 : quantized->_dst_quant.scale,
                          num_rows_out + num_padding_out,
                          num_columns_out,
                          ptr_inputs,
                          ptr_outputs);

    size_t num_data_bytes_out = ALIGN(InferenceEngine::details::product(
                                                            begin(outputs->dims), end(outputs->dims)), 8)
                                                                                * outputs->precision.size();
    size_t num_data_bytes_in = num_columns_in * ALIGN(num_rows_in, 8) * inputs->precision.size();

    connectInput(layer, ptr_inputs, num_data_bytes_in);
    connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out);
}

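// Maps an IR Concat onto a shared GNA buffer: no copy component is emitted;
// the inputs are wired at their recorded offsets inside the reserved region.
// Only concats with exactly two inputs are supported here.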
void GNAPlugin::ConcatPrimitive(InferenceEngine::CNNLayerPtr layer) {
    auto concatLayer = dynamic_cast<InferenceEngine::ConcatLayer *> (layer.get());

    if (concatLayer == nullptr) {
        return;
    }
    if (concatLayer->insData.size() != 2) {
        THROW_GNA_EXCEPTION << "Concat layer has unsupported number of incoming layers.";
    }

    auto prevInput0 = concatLayer->insData[0].lock();
    auto prevInput1 = concatLayer->insData[1].lock();
    if (!prevInput0 || !prevInput1) {
        THROW_GNA_EXCEPTION << "Input layer for concat is unexpectedly absent";
    }
    if (prevInput0->precision.size() != prevInput1->precision.size()) {
        THROW_GNA_EXCEPTION << "Different precisions for Concat input layers are not supported";
    }

    auto& concatLayerInfo = concat_connection.find(concatLayer->name)->second;
    for (auto &&outLayer : concatLayer->outData.front()->getInputTo()) {
        if (LayerInfo(outLayer.second).isConcat()) {
            connectOutput(layer, &concatLayerInfo.gna_ptr,
                          &concatLayerInfo.gna_ptr, concatLayerInfo.reserved_size);
        }
    }

    size_t idx = 0;
    for (auto && inputLayer : concatLayerInfo.concatInputLayers) {
        if (InferenceEngine::details::CaselessEq<std::string>()(inputLayer.name, "input")) {
            connectInput(layer, &concatLayerInfo.gna_ptr,
                                concatLayerInfo.reserved_size - inputLayer.offset, static_cast<int32_t>(-inputLayer.offset), idx);
        }
        ++idx;
    }
}

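// Maps an IR Crop layer. When the crop offset is 64-byte aligned, the crop is
// expressed as a plain pointer offset into the input buffer; otherwise the
// layer is replaced by an affine "aligning filter" whose weights select the
// cropped rows (see FillWeightOfAligningFilter).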
void GNAPlugin::CropPrimitive(InferenceEngine::CNNLayerPtr layer) {
    auto cropLayer = dynamic_cast<InferenceEngine::CropLayer *> (layer.get());

    if (cropLayer == nullptr) {
        return;
    }
    if (cropLayer->axis.size() > 1) {
        THROW_GNA_EXCEPTION <<
        "Crop layer does not support the number of cropped dimensions = "
        << cropLayer->axis.size() << ".";
    }

    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
    size_t cropOffset = cropLayer->offset.back() * cropLayer->precision.size();
    size_t cropOutputSize = cropLayer->dim.back() * cropLayer->precision.size();

    if (ALIGN64(cropOffset) == cropOffset) {
        // leave the crop as it is
        GNAPlugin::GNACropLayer cropLayerInfoItem(layer);
        std::string& id = layer->name;
        crop_connection.emplace(id, cropLayerInfoItem);
        auto cropLayerInfo = crop_connection.find(cropLayer->name);

        if (cropLayerInfo == crop_connection.end()) {
            THROW_GNA_EXCEPTION <<
            "Item is not in the storage, but it was added recently...\n";
        }

        // calculate index idx for connectInput's last parameter
        connectInput(layer, &cropLayerInfo->second.gna_ptr, cropOutputSize + cropOffset, cropOffset, 0);

        // cases for certain output layers
        for (auto &&outLayer : layer->outData.front()->getInputTo()) {
            auto& nextLayer = outLayer.second;
            if (LayerInfo(nextLayer).isConcat()) {
                connectOutput(layer, &cropLayerInfo->second.gna_ptr, &cropLayerInfo->second.gna_ptr, cropOutputSize);
            }
        }
    } else {
        gnalog() << "Crop " << layer->name << " is being replaced by Affine layer...\n";
        auto outputs = *layer->outData.begin();
        auto inputs = layer->insData.begin()->lock();

        uint32_t num_rows_in = FROM_IR_DIM(inputs, 1);
        uint32_t num_columns_in = FROM_IR_DIM(inputs, 2);
        uint32_t num_rows_out = FROM_IR_DIM(outputs, 1);
        uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in;

        void *ptr_inputs;
        void *ptr_outputs;
        void *ptr_weights;
        void *ptr_biases;

        dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t());
        auto &currentComponent = dnnComponentsForLayer.back().second;
        dnn.InitAffineComponent(currentComponent,
                                num_rows_in + num_padding,
                                num_columns_in,
                                num_rows_out,
                                inputs->precision.size(),
                                4,
                                quantized == nullptr ? inputs->precision.size() : 2,
                                4,
                                quantized == nullptr ? 1 : quantized->_weights_quant.scale,
                                quantized == nullptr ? 1 : quantized->_dst_quant.scale,
                                ptr_inputs,
                                ptr_outputs,
                                ptr_weights,
                                ptr_biases,
                                false);

        size_t num_data_bytes_out =
            InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims)) * 4;

        size_t num_data_bytes_in = num_columns_in *
                ALIGN(num_rows_in, 8) * inputs->precision.size();

        connectInput(layer, ptr_inputs, num_data_bytes_in, 0, 0);
        connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out);

        FillWeightOfAligningFilter(layer, ptr_weights, cropLayer->offset.back(), (quantized == nullptr) ? false : true);

        (quantized == nullptr) ?
            gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64) :
            gnamem->readonly().push_value<int32_t>(ptr_biases, 0, num_rows_out, 64);
    }
}

void GNAPlugin::SplitPrimitive(InferenceEngine::CNNLayerPtr layer) {
    // Nothing to do
}

void GNAPlugin::SlicePrimitive(InferenceEngine::CNNLayerPtr layer) {
    // Nothing to do
}

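// Maps an IR Eltwise onto a diagonal affine component: Sum uses identity
// weights with the second operand wired in as the bias vector, while Prod uses
// the second operand as the diagonal weights with zero biases.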
void GNAPlugin::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) {
    auto &eltwise = dynamic_cast<EltwiseLayer &>(*layer.get());
    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);

    // for eltwise there should be one input of 4 bytes and one of 2 bytes; detect which is which
    auto inputs2Bytes = layer->insData[0].lock();
    auto inputs4Bytes = layer->insData[1].lock();

    int biasesLayerIdx = 1;

    if (quantized) {
        if (eltwise._operation == EltwiseLayer::Sum) {
            if (inputs4Bytes->precision.size() != 4) {
                std::swap(inputs4Bytes, inputs2Bytes);
                biasesLayerIdx = 0;
            }
            IE_ASSERT(inputs2Bytes->precision.size() == 2);
            IE_ASSERT(inputs4Bytes->precision.size() == 4);
        } else {
            // for mul, both inputs should have 2-byte precision
            IE_ASSERT(inputs2Bytes->precision.size() == 2);
            IE_ASSERT(inputs4Bytes->precision.size() == 2);
        }
    }

    auto outputs = *layer->outData.begin();

    uint32_t num_rows_in = FROM_IR_DIM(inputs4Bytes, 1);
    uint32_t num_columns_in = FROM_IR_DIM(inputs4Bytes, 2);
    uint32_t num_rows_out = num_rows_in;
    uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in;

    void *ptr_inputs;
    void *ptr_outputs;
    void *ptr_weights;
    void *ptr_biases;

    dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t());
    auto &currentComponent = dnnComponentsForLayer.back().second;
    dnn.InitAffineComponent(currentComponent,
                            num_rows_in + num_padding,
                            num_columns_in,
                            num_rows_out + num_padding,
                            inputs2Bytes->precision.size(),
                            outputs->precision.size(),
                            // TODO: only fp32 and Int16 tested
                            quantized == nullptr ? inputs2Bytes->precision.size() : 2,
                            quantized == nullptr ? inputs4Bytes->precision.size() : 4,
                            quantized == nullptr ? 1 : quantized->_weights_quant.scale,
                            quantized == nullptr ? 1 : quantized->_dst_quant.scale,
                            ptr_inputs,
                            ptr_outputs,
                            ptr_weights,
                            ptr_biases,
                            true);

#ifdef PLOT
    cout << "IR layer : " << std::left << std::setw(20) << layer->name << "diagonal_" << dnnComponentsForLayer.size() - 1 << "\n";
#endif

    size_t num_data_bytes_out =
        InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims)) * outputs->precision.size();

    size_t num_data_bytes_in =
        num_columns_in * (num_rows_in + num_padding) * inputs2Bytes->precision.size();

    connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out);
    connectInput(layer, ptr_inputs, num_data_bytes_in, 0, 1 - biasesLayerIdx);

    switch (eltwise._operation) {
        case EltwiseLayer::Sum:
            if (quantized == nullptr) {
                gnamem->readonly().push_value(ptr_weights, 1.0f, num_rows_out, 64);
            } else {
                auto scaledIdentity = quantized->_weights_quant.scale;

                #define FLOAT_TO_INT16(a) static_cast<int16_t>(((a) < 0)?((a) - 0.5):((a) + 0.5))

                auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast<float>(INT16_MAX)));

                gnamem->readonly().push_value<int16_t>(ptr_weights, quantizedIdentity, num_rows_out, 64);
            }
            connectInput(layer, ptr_biases, num_data_bytes_in, 0, biasesLayerIdx);
            break;

        case EltwiseLayer::Prod:
            if (quantized == nullptr) {
                gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64);
            } else {
                gnamem->readonly().push_value<int32_t>(ptr_biases, 0, num_rows_out, 64);
            }
            connectInput(layer, ptr_weights, num_data_bytes_in, 0, biasesLayerIdx);
            break;

        default:
            THROW_GNA_EXCEPTION << "Unsupported eltwise operation: " << eltwise._operation;
    }
}

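// Maps an IR FullyConnected layer (or a diagonal component when isDiag is
// true) onto a GNA affine component. Input rows are padded to a multiple of 8,
// weights are re-laid-out row by row when padding is needed, and a currently
// disabled (0 && ...) path would transpose weights for affines fed through a
// permute after a convolution.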
void GNAPlugin::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool isDiag) {
    auto &weightable = dynamic_cast<WeightableLayer &>(*layer.get());
    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);

    auto inputs = layer->insData.begin()->lock();
    auto outputs = *layer->outData.begin();

    uint32_t num_rows_in = FROM_IR_DIM(inputs, 1);
    uint32_t num_columns_in = FROM_IR_DIM(inputs, 2);
    uint32_t num_rows_out = isDiag ? num_rows_in : FROM_IR_DIM(outputs, 1);
    uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in;

    void *ptr_inputs;
    void *ptr_outputs;
    void *ptr_weights;
    void *ptr_biases;

    // TODO: it is questionable why we invent a precision for biases that are not in the IR
    auto biasPrecision = weightable._biases ? weightable._biases->precision() : outputs->precision;

    dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t());
    auto &currentComponent = dnnComponentsForLayer.back().second;

#ifdef PLOT
    cout << "IR layer : " << std::left << std::setw(20) << layer->name << (isDiag ? "diagonal_" : "affine_") << dnnComponentsForLayer.size() - 1 << "\n";
#endif

    dnn.InitAffineComponent(currentComponent,
                            num_rows_in + num_padding,
                            num_columns_in,
                            num_rows_out,
                            inputs->precision.size(),
                            outputs->precision.size(),
                            weightable._weights->precision().size(),
                            biasPrecision.size(),
                            quantized == nullptr ? 1 : quantized->_weights_quant.scale,
                            quantized == nullptr ? 1 : quantized->_dst_quant.scale,
                            ptr_inputs,
                            ptr_outputs,
                            ptr_weights,
                            ptr_biases,
                            isDiag);

    size_t num_data_bytes_out = InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims))
        * outputs->precision.size();

    size_t num_data_bytes_in = num_columns_in * (num_rows_in + num_padding) * inputs->precision.size();

    auto connectionInfo = connectInput(layer, ptr_inputs, num_data_bytes_in);
    connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out);

    auto transpose = false;
    auto transposedRows = 0;
    auto transposedCols = 0;

    if (0 && connectionInfo.needTransposeWeights) {
        // direct order is 0, 1, 2, 3; the only supported order is 0, 3, 2, 1, where dim 2 is usually equal to 1
        auto permuteOrder = connectionInfo.permute->GetParamAsInts("order");
        if (permuteOrder != vector<int>({0, 3, 2, 1})) {
            THROW_IE_EXCEPTION << "[GNA plugin] Unsupported permute order: was " << layer->GetParamAsString("order") <<
                               ", but only support 0, 3, 2, 1";
        }

        /**
         * TODO: a weights transpose performed after quantisation might result in poor quality for int8 - move this to passes
         */
        if (weightable._weights->precision() == Precision::I8) {
            THROW_IE_EXCEPTION << "[GNA plugin] Unsupported permute operation for 8 bit weights for layer: " << layer->name;
        }

        // this affine is connected to a convolution via pool or activation
        gnalog() << "Transposing weights for layer: " << layer->name << "\n";

        transpose = !isDiag;
        transposedRows = connectionInfo.permute->input()->getDims()[3];
        transposedCols = connectionInfo.permute->input()->getDims()[1];
    }

    if (num_padding == 0) {
        if (!transpose) {
            gnamem->readonly().push_ptr(ptr_weights,
                                        weightable._weights->cbuffer().as<const void *>(),
                                        weightable._weights->byteSize(),
                                        64);
        } else {
            gnamem->readonly().push_initializer(ptr_weights, weightable._weights->byteSize(), [=](void * data, size_t size) {
                for (uint32_t k = 0; k < (isDiag ? 1 : num_rows_out); k++) {
                    auto rowOffset = k * transposedRows * transposedCols * weightable.precision.size();
                    auto cbuffer = weightable._weights->cbuffer().as<const uint8_t *>() + rowOffset;
                    auto u8Data = reinterpret_cast<uint8_t *>(data) + rowOffset;
                    for (int j = 0; j < transposedCols; j++) {
                        for (int i = 0; i < transposedRows; i++) {
                            auto offsetWrite = (transposedRows * j + i) * weightable.precision.size();
                            auto offsetRead = (i * transposedCols + j) * weightable.precision.size();
                            std::memcpy(u8Data + offsetWrite, cbuffer + offsetRead, weightable.precision.size());
                        }
                    }
                }
            }, 64);
        }
    } else {
        if (transpose) {
            THROW_GNA_EXCEPTION << "transposed weights with non-zero padding are not yet supported";
        }
        auto elementsIn = (num_rows_in + num_padding) * num_columns_in;
        auto paddedWeights = isDiag ? elementsIn : elementsIn * num_rows_out;
        auto paddedWeightsSize = paddedWeights * weightable.precision.size();

        gnamem->readonly().push_initializer(ptr_weights, paddedWeightsSize, [=](void * data, size_t size) {
            for (uint32_t i = 0; i < (isDiag ? 1 : num_rows_out); i++) {
                memcpy(data,
                       weightable._weights->cbuffer().as<const uint8_t *>() + num_rows_in * i * weightable.precision.size(),
                       num_rows_in * weightable.precision.size());
                data = reinterpret_cast<uint8_t *>(data) + (num_rows_in + num_padding) * weightable.precision.size();
            }
        }, 64);
    }

    if (weightable._biases) {
        gnamem->readonly().push_ptr(ptr_biases,
                         weightable._biases->cbuffer().as<const void *>(),
                         weightable._biases->byteSize(),
                         64);
    } else {
        gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64);
    }
}

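// Fills the weights of an "aligning filter": a num_rows_out x
// ALIGN(num_rows_in, 8) matrix with ones along a shifted diagonal, so that
// output row i selects input element offset + i.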
void GNAPlugin::FillWeightOfAligningFilter(InferenceEngine::CNNLayerPtr layer, void* ptrWeights, size_t offset, bool isQuantized) {
    auto outputs = *layer->outData.begin();
    auto inputs = layer->insData.begin()->lock();

    uint32_t num_rows_in = FROM_IR_DIM(inputs, 1);
    uint32_t num_rows_out = FROM_IR_DIM(outputs, 1);

    if (!ptrWeights) {
        THROW_GNA_EXCEPTION << "Weights memory is not allocated!";
    }

    gnamem->readonly().push_initializer(ptrWeights, num_rows_out * ALIGN(num_rows_in, 8) * layer->precision.size(), [=](void * data, size_t size) {
        int out = 0;
        for (size_t input = offset; input < num_rows_out + offset; ++input) {
            auto mem_ptr = reinterpret_cast<uint8_t *>(data) + input * layer->precision.size() + out * ALIGN(num_rows_in, 8) * layer->precision.size();
            if (!isQuantized) {
                auto float_ptr = reinterpret_cast<float *>(mem_ptr);
                *float_ptr = 1.0f;
            } else {
                auto int_ptr = reinterpret_cast<uint16_t *>(mem_ptr);
                *int_ptr = 1;
            }
            ++out;
        }
    }, 64);
}

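// Emits the affine component for an AffineFilter inserted after a Split/Slice
// layer: it reuses the filter's precomputed weights, padding each weight row
// out to a multiple of 8 input elements when required.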
void GNAPlugin::AffineFilterPrimitive(InferenceEngine::CNNLayerPtr layer) {
    auto filterLayer = dynamic_cast<InferenceEngine::WeightableLayer *> (layer.get());

    if (filterLayer == nullptr) {
        return;
    }

    std::string& name = filterLayer->name;
    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);

    // we look for this layer's pointer in the extra split/slice map
    auto prevLayer = CNNNetPrevLayer(layer.get(), 0);
    if (!LayerInfo(prevLayer).isSplit() && !LayerInfo(prevLayer).isSlice()) {
        THROW_GNA_EXCEPTION << "The case of an Affine Aligning Filter for non-Split/Slice layers is not implemented yet!";
    }

    void *ptr_inputs;
    void *ptr_outputs;
    void *ptr_weights;
    void *ptr_biases;

    auto outputs = *layer->outData.begin();
    auto inputs = layer->insData.begin()->lock();

    uint32_t num_columns_in = FROM_IR_DIM(inputs, 2);
    uint32_t num_rows_out = FROM_IR_DIM(outputs, 1);
    uint32_t num_rows_in = filterLayer->_weights->size() / num_rows_out;

    uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in;

    gnalog() << "Filter " << layer->name << " is being inserted...\n";
    auto biasPrecision = filterLayer->_biases ? filterLayer->_biases->precision() : outputs->precision;
    dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t());
    auto &currentComponent = dnnComponentsForLayer.back().second;
    dnn.InitAffineComponent(currentComponent,
                            num_rows_in + num_padding,
                            num_columns_in,
                            num_rows_out,
                            inputs->precision.size(),
                            outputs->precision.size(),
                            filterLayer->_weights->precision().size(),
                            biasPrecision.size(),
                            quantized == nullptr ? 1 : quantized->_weights_quant.scale,
                            quantized == nullptr ? 1 : quantized->_dst_quant.scale,
                            ptr_inputs,
                            ptr_outputs,
                            ptr_weights,
                            ptr_biases,
                            false);

    size_t num_data_bytes_out =
                InferenceEngine::details::product(
                                        begin(outputs->dims), end(outputs->dims)) * 4;

    size_t num_data_bytes_in = num_columns_in *
                            ALIGN(num_rows_in, 8) * inputs->precision.size();

    connectInput(layer, ptr_inputs, num_data_bytes_in, 0, 0);
    connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out);

    if (num_padding == 0) {
        gnamem->readonly().push_ptr(ptr_weights,
                                    filterLayer->_weights->cbuffer().as<const void *>(),
                                    filterLayer->_weights->byteSize(),
                                    64);
    } else {
        auto elementsIn = (num_rows_in + num_padding) * num_columns_in;
        auto paddedWeights = elementsIn * num_rows_out;
        auto paddedWeightsSize = paddedWeights * filterLayer->precision.size();

        gnamem->readonly().push_initializer(ptr_weights, paddedWeightsSize, [=](void * data, size_t size) {
            for (uint32_t i = 0; i < num_rows_out; i++) {
                std::memcpy(data,
                       filterLayer->_weights->cbuffer().as<const uint8_t *>() + num_rows_in * i * filterLayer->precision.size(),
                       num_rows_in * filterLayer->precision.size());
                data = reinterpret_cast<uint8_t *>(data) + (num_rows_in + num_padding) * filterLayer->precision.size();
            }
        }, 64);
    }

    if (filterLayer->_biases) {
        gnamem->readonly().push_ptr(ptr_biases,
                         filterLayer->_biases->cbuffer().as<const void *>(),
                         filterLayer->_biases->byteSize(),
                         64);
    } else {
        gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64);
    }
}

1219 void GNAPlugin::PWLPrimitive(InferenceEngine::CNNLayerPtr layer) {
1220     auto *generic = dynamic_cast<GenericLayer *>(layer.get());
1221     std::string type;
1222     std::vector<intel_pwl_segment_t> ptr_pwl_segments;
1223     uint32_t num_rows;
1224     uint32_t num_columns;
1225     void *ptr_inputs;
1226     void *ptr_outputs;
1227
1228     do {
1229         if (generic == nullptr) {
1230             type = layer->type;
1231             break;
1232         }
1233
1234         if (CaselessEq<string>()(layer->type, "activation")) {
1235             type = generic->GetParamAsString("type");
1236             break;
1237         } else {
1238             type = layer->type;
1239             break;
1240         }
1241     } while (false);
1242
1243     auto inputs = layer->insData.begin()->lock();
1244     auto outputs = *layer->outData.begin();
1245     auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
1246     float output_scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f;
1247
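    // num_cnn_rows_out > 0 means the preceding primitive was convolutional; CNN components produce
    // non-interleaved data, so the PWL presumably has to adopt that orientation to read them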
1248     auto orientation = (num_cnn_rows_out > 0) ? kDnnNonInterleavedOrientation : kDnnInterleavedOrientation;
1249
1250     if (inputs->dims.size() == 4) {
1251         num_columns = FROM_IR_DIM(inputs, 3) * FROM_IR_DIM(inputs, 1);
1252         num_rows = 1;
1253     } else {
1254         num_columns = FROM_IR_DIM(inputs, 2);
1255         num_rows = FROM_IR_DIM(inputs, 1);
1256     }
1257
1258     size_t num_data_bytes_out = InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims))
1259         * outputs->precision.size();
1260
1261     size_t num_data_bytes_in = InferenceEngine::details::product(begin(inputs->dims), end(inputs->dims))
1262         * inputs->precision.size();
1263
1264     static caseless_unordered_map<std::string, DnnActivationType> supportedActivations = {
1265         {"sigmoid", kActSigmoid},
1266         {"tanh", kActTanh},
1267         {"relu", kActRelu},
1268         {"leakyrelu", kActLeakyRelu},
1269         {"clamp", kActKaldiLstmClipping},
1270         {"identity", kActIdentity}
1271     };
1272
1273     auto it = supportedActivations.find(type);
1274     if (it == supportedActivations.end()) {
1275         THROW_GNA_EXCEPTION << "Activation function type not yet supported: " << type;
1276     }
1277     auto activation_type = DnnActivation::fromType(it->second);
1278     activation_type.negative_slope = (it->second == kActRelu) ? dynamic_cast<ReLULayer*>(layer.get())->negative_slope : 0.0f;
1279
1280     // TODO: need to take graph dependency instead of linear
1281     auto &prevComponent = dnnComponentsForLayer.back().second;
1282     dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t());
1283     auto &currentComponent = dnnComponentsForLayer.back().second;
1284
1285     intel_pwl_segment_t *ptr_pwl_segments_target = nullptr;
1286
1287     if (!inputs->precision.is_float()) {
1288         // TODO: generalize activation function code
1289         // now that scale factors are known, create PWL approximations to activation functions
1290         float input_scale_factor = dnn.OutputScaleFactor(prevComponent);
1291         if (uniformPwlDesign) {
1292             switch (activation_type) {
1293                 case kActSigmoid:ptr_pwl_segments.resize(SIGMOID_NUM_SEGMENTS);
1294                     break;
1295                 case kActTanh:ptr_pwl_segments.resize(TANH_NUM_SEGMENTS);
1296                     break;
1297                 case kActRelu:ptr_pwl_segments.resize(RELU_NUM_SEGMENTS);
1298                     break;
1299                 case kActLeakyRelu:ptr_pwl_segments.resize(RELU_NUM_SEGMENTS);
1300                     break;
1301                 case kActKaldiLstmClipping:
1302                 case kActIdentity:ptr_pwl_segments.resize(IDENTITY_NUM_SEGMENTS);
1303                     break;
1304                 case kActCustom:
1305                 default: THROW_GNA_EXCEPTION << "Activation function type not yet supported: " << activation_type;
1306             }
1307             PwlDesign16(activation_type,
1308                         &*ptr_pwl_segments.begin(),
1309                         static_cast<uint32_t>(ptr_pwl_segments.size()),
1310                         input_scale_factor,
1311                         output_scale_factor);
1312         } else {
1313             PwlDesignOpt16(activation_type,
1314                            ptr_pwl_segments,
1315                            input_scale_factor,
1316                            output_scale_factor);
1317         }
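        // the self-referencing cast below only produces a distinctive non-null placeholder; the
        // actual segment array is bound to this pointer at the end of the function via
        // gnamem->readonly().push_local_ptr()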
1318         ptr_pwl_segments_target = reinterpret_cast<intel_pwl_segment_t *>(&ptr_pwl_segments_target);
1319     }
1320
1321     dnn.InitPiecewiseLinearComponent(currentComponent,
1322                                      activation_type,
1323                                      orientation,
1324                                      num_rows,
1325                                      num_columns,
1326                                      inputs->precision.size(),
1327                                      outputs->precision.size(),
1328                                      ptr_pwl_segments.size(),
1329                                      output_scale_factor,
1330                                      ptr_inputs,
1331                                      ptr_outputs,
1332                                      ptr_pwl_segments_target);
1333 #ifdef PLOT
1334 #define GET_ACTIVATION_NAME(name)\
1335 case name:\
1336     actName = #name;\
1337     break;
1338     string actName = "unknown";
1339     switch (activation_type) {
1340         GET_ACTIVATION_NAME(kActSigmoid);
1341         GET_ACTIVATION_NAME(kActTanh);
1342         GET_ACTIVATION_NAME(kActRelu);
1343         GET_ACTIVATION_NAME(kActLeakyRelu);
1344         GET_ACTIVATION_NAME(kActKaldiLstmClipping);
1345         GET_ACTIVATION_NAME(kActIdentity);
1346     }
1347     cout << "IR layer : " << std::left << std::setw(20) << layer->name << actName << "_" << dnnComponentsForLayer.size() - 1 << "\n";
1348 #endif
1349
1350     connectInput(layer, ptr_inputs, num_data_bytes_in);
1351     connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out);
1352
1353     if (ptr_pwl_segments_target != nullptr) {
1354         gnamem->readonly().push_local_ptr(ptr_pwl_segments_target,
1355                                           &ptr_pwl_segments.front(),
1356                                           ptr_pwl_segments.size() * sizeof(intel_pwl_segment_t),
1357                                           64);
1358     }
1359 }
1360
1361
1362 void GNAPlugin::PermutePrimitive(InferenceEngine::CNNLayerPtr layer) {
1363     auto layerOrder = layer->GetParamAsInts("order");
1364
1365     if (layerOrder != vector<int>({0, 3, 2, 1})) {
1366         THROW_IE_EXCEPTION << "[GNA plugin] Unsupported permute order: was " << layer->GetParamAsString("order") <<
1367                            ", but only support 0,3,2,1";
1368     }
1369 }
1370
1371 class LayersBuilder {
1372     using CreatorFnc = std::function<void(GNAPlugin*, CNNLayerPtr)>;
1373
1374  public:
1375     LayersBuilder(const std::vector<std::string> &types, CreatorFnc callback) {
1376         for (auto && str : types) {
1377             getStorage()[str] = callback;
1378         }
1379     }
1380     static caseless_unordered_map<std::string, CreatorFnc> &getStorage() {
1381         static caseless_unordered_map<std::string, CreatorFnc> LayerBuilder;
1382         return LayerBuilder;
1383     }
1384 };
1385
1386 #define CREATE(name) [](GNAPlugin *p, CNNLayerPtr l) {p->name(l);}
1387 void SKIP(GNAPlugin*, CNNLayerPtr) {}
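// Registration mechanics: every LayersBuilder constructed below inserts its callback into the
// shared static storage keyed by layer type; since the layersBuilder array is a local static in
// CreateLayerPrimitive, all registrations take effect on the first call to that function.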
1388
1389 void GNAPlugin::CreateLayerPrimitive(CNNLayerPtr layer) {
1390     static const LayersBuilder layersBuilder[] = {
1391         {{"Input"}, [](GNAPlugin*, CNNLayerPtr l) {}},  // skip input layers they are not used in GNA lib, only as a memory blobs
1392         {{"FullyConnected", "InnerProduct"}, CREATE(AffinePrimitive)},
1393         {{"ScaleShift"}, CREATE(DiagonalPrimitive)},
1394         {{"AffineFilter"}, CREATE(AffineFilterPrimitive)},
1395         {{"Eltwise"},
1396          CREATE(EltwisePrimitive)},  // same as diagonal, except the weights are taken not from the network but from another output
1397         {{"Split"}, SKIP},  // skip; which part of the previous layer to consume is handled during layer creation
1398         {{"Slice"}, SKIP},
1399         {{"clamp", "sigmoid", "relu", "tanh", "identity"}, CREATE(PWLPrimitive)},
1400         {{"Convolution"}, CREATE(ConvolutionPrimitive)},
1401         {{"Permute"}, CREATE(PermutePrimitive)},  // permute of certain form (2D transpose) can be assimilated in followed FC layer
1402         {{"Pooling"}, CREATE(PoolingPrimitive)},
1403         {{"Power"} , CREATE(PowerPrimitive)},
1404         {{"Concat"}, CREATE(ConcatPrimitive)},
1405         {{"Reshape"}, SKIP},  // TODO: handled not in GNA but rather in GNA plugin
1406         {{"Crop"}, CREATE(CropPrimitive)},
1407         {{"Copy"}, CREATE(CopyPrimitive)},
1408     };
1409     auto it = LayersBuilder::getStorage().find(layer->type);
1410     if (it != LayersBuilder::getStorage().end()) {
1411         it->second(this, layer);
1412     } else {
1413         THROW_GNA_EXCEPTION << "Unsupported layer: " << layer->name << ":" << layer->type;
1414     }
1415 }
1416
1417
1418 GNAPlugin::GNAPlugin(const std::map<std::string, std::string>& configMap) {
1419     SetConfig(configMap);
1420 }
1421
1422 GNAPluginNS::GNAPlugin::LayerType GNAPlugin::LayerTypeFromStr(const std::string &str) const {
1423     static const caseless_map<std::string, GNAPlugin::LayerType> LayerNameToType = {
1424         { "Input" , Input },
1425         { "Convolution" , Convolution },
1426         { "ReLU" , ReLU },
1427         { "Sigmoid" , Sigmoid },
1428         { "TanH" , TanH },
1429         { "Pooling" , Pooling },
1430         { "FullyConnected" , FullyConnected },
1431         { "InnerProduct" , InnerProduct},
1432         { "Split" , Split },
1433         { "Slice" , Slice },
1434         { "Eltwise" , Eltwise },
1435         { "Reshape" , Reshape },
1436         { "ScaleShift" , ScaleShift },
1437         { "Clamp" , Clamp },
1438         { "Concat" , Concat },
1439         { "Copy", Copy },
1440         { "Permute" , Permute },
1441         { "Power" , Power},
1442         { "Memory" , Memory },
1443         { "Crop" , Crop }
1444     };
1445     auto it = LayerNameToType.find(str);
1446     if (it != LayerNameToType.end())
1447         return it->second;
1448     else
1449         return NO_TYPE;
1450 }
1451
1452 bool GNAPlugin::AreLayersSupported(ICNNNetwork& network, std::string& errMessage) {
1453     CNNLayerSet inputLayers;
1454     InferenceEngine::InputsDataMap inputs;
1455     std::unordered_set<CNNLayer *> allLayers;
1456     auto specifiedDevice = network.getTargetDevice();
1457     auto network_precision = network.getPrecision();
1458     network.getInputsInfo(inputs);
1459     auto network_input_precision = inputs.begin()->second->getInputPrecision();
1460     auto batch_size = network.getBatchSize();
1461     if (network_precision != Precision::FP32) {
1462         errMessage = "The plugin does not support networks with " + std::string(network_precision.name()) + " format.\n";
1463         return false;
1464     }
1465     if (network_input_precision != Precision::FP32 &&
1466         network_input_precision != Precision::I16 &&
1467         network_input_precision != Precision::U8) {
1468         errMessage = "The plugin does not support input precision with " + std::string(network_input_precision.name()) + " format.\n";
1469         return false;
1470     }
1471     if (specifiedDevice != InferenceEngine::TargetDevice::eCPU &&
1472         specifiedDevice != InferenceEngine::TargetDevice::eGNA &&
1473         specifiedDevice != InferenceEngine::TargetDevice::eDefault) {
1474         errMessage = "The plugin does not support target device: " + std::string(getDeviceName(specifiedDevice)) + ".\n";
1475         return false;
1476     }
1477
1478     if (inputs.empty()) {
1479         errMessage = "Network is empty (GNA)\n";
1480         return false;
1481     }
1482
1483     auto & secondLayers = inputs.begin()->second->getInputData()->getInputTo();
1484     if (secondLayers.empty()) {
1485         errMessage = "Network consists of input layer only (GNA)\n";
1486         return false;
1487     }
1488
1489     bool check_result = true;
1490     InferenceEngine::details::UnorderedDFS(allLayers,
1491                                            secondLayers.begin()->second,
1492                                            [&](const CNNLayerPtr layer) {
1493                                                 if (LayerTypeFromStr(layer->type) == NO_TYPE) {
1494                                                     errMessage = "Layer is unsupported by GNA: " + layer->name + ":" + layer->type + "\n";
1495                                                     check_result =  false;
1496                                                 }
1497                                                 if (batch_size != 1 && LayerInfo::isBatchSizeConstrained(layer->type)) {
1498                                                     errMessage = "topology with layer: " + layer->name + ", type: " + layer->type +
1499                                                                  ", and batch size (" + to_string(batch_size) + ") != 1 is not supported";
1500                                                     check_result =  false;
1501                                                 }
1502                                             }, false);
1503
1504     return check_result;
1505 }
1506
1507 float GNAPlugin::get_input_scale_factor() const {
1508     return input_scale_factor.empty() ? 1.0f : input_scale_factor.begin()->second;
1509 }
1510
1511 void GNAPlugin::LoadNetwork(ICNNNetwork &network) {
1512     //  Check the input network
1513     std::string error;
1514     if (!AreLayersSupported(network, error)) {
1515         THROW_GNA_EXCEPTION << error.c_str();
1516     }
1517
1518     // network optimisation phases
1519     auto run_passes = [&] (CNNNetPtr network) {
1520         auto layers = CNNNetSortTopologically(*network.get());
1521         substitutePRelu(layers);
1522         layers = CNNNetSortTopologically(*network.get());
1523         reorderMaxPool(layers);
1524         //  TODO: re-sort only if a bool flag "changed"
1525         //  is returned from the insertion function
1526         insertAligningFilterLayer(layers);
1527
1528 #if ENABLE_AUTO_PERMUTE
1529         layers = CNNNetSortTopologically(*network.get());
1530         reversePermutations(layers);
1531 #endif
1532         layers = CNNNetSortTopologically(*network.get());
1533         insertIdentityLayer(layers);
1534         layers = CNNNetSortTopologically(*network.get());
1535         insertCopyLayer(layers);
1536         layers = CNNNetSortTopologically(*network.get());
1537         insertDiagonalLayer(layers);
1538         layers = CNNNetSortTopologically(*network.get());
1539         substituteScaleShiftBroadCast(layers);
1540     };
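    // each pass above can change graph connectivity, hence the defensive re-sort before every pass;
    // per the TODO, this could be skipped when an insertion pass reports that nothing changed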
1541
1542     Config supported = Config({
1543         {TargetDevice::eGNA, Precision::FP32, [&](InferenceEngine::ICNNNetwork &network) -> CNNNetworkPtr {
1544             if (gnaPrecision == Precision::I16) {
1545                 ModelQuantizer<QuantI16> q;
1546                 return q.quantize(network, run_passes, get_input_scale_factor());
1547             }
1548
1549             if (gnaPrecision == Precision::I8) {
1550                 ModelQuantizer<QuantI8> q;
1551                 return q.quantize(network, run_passes, get_input_scale_factor());
1552             }
1553             THROW_GNA_EXCEPTION << "no man's land for GNA precision";
1554         }},
1555         // TODO: need to have advanced precision matcher based on layers/biases
1556         {TargetDevice::eGNA, Precision::MIXED},
1557         {TargetDevice::eGNA, Precision::I16},
1558         {TargetDevice::eCPU, Precision::FP32
1559 #define EMULATE_GNA_API_LAYERS
1560 #ifdef  EMULATE_GNA_API_LAYERS
1561             , [&](InferenceEngine::ICNNNetwork & network) {
1562             auto visitor = [&](InferenceEngine::CNNLayerPtr lp) {
1563                 return lp;
1564             };
1565             auto copiedNet = InferenceEngine::CNNNetCopy(network, visitor);
1566             run_passes(copiedNet);
1567
1568             return copiedNet;
1569         }
1570 #endif
1571     }
1572     });
1573
1574     supported.setDefaultDevice(TargetDevice::eGNA);
1575     auto newNet = supported.find_configuration(network).convert(network);
1576
1577
1578
1579     // creating intel dnn_t structures from network
1580     auto sortedNet = CNNNetSortTopologically(*newNet);
1581     std::vector<CNNLayerPtr> sortedNoMem;
1582     std::unordered_map<std::string, std::vector<InferenceEngine::CNNLayerPtr>> memoryPairs;
1583     // find all memory layers pairs and mark which one used as outputs
1584     for (auto &layer : sortedNet) {
1585         auto generic = dynamic_cast<GenericLayer *>(layer.get());
1586         if (generic == nullptr) {
1587             sortedNoMem.push_back(layer);
1588             continue;
1589         }
1590         LayerInfo layerInfo(layer);
1591         if (layerInfo.isMemory()) {
1592             // collect all memory pairs
1593             auto id = generic->GetParamAsString("id");
1594             memoryPairs[id].resize(generic->GetParamAsInt("size"));
1595             memoryPairs[id][generic->GetParamAsInt("index")] = layer;
1596             continue;
1597         } else if (layerInfo.isConcat()) {
1598             fillConcatConnections(layer);
1599         } else if (layerInfo.isSplit() || layerInfo.isSlice()) {
1600             fillSplitConnections(layer);
1601         }
1602         sortedNoMem.push_back(layer);
1603     }
1604
1605     // fill in extra storage with memory layers
1606     fillMemoryConnections(memoryPairs);
1607
1608     if (memory_connection.size() != 0) {
1609         gna_lib_async_threads_num = 1;
1610     }
1611
1612     auto networkPrecision = newNet->getPrecision();
1613
1614     if (!networkPrecision.is_float()) {
1615         gnadevice.reset(new GNADeviceHelper(gna_proc_type,
1616                                             gna_lib_async_threads_num,
1617                                             gna_openmp_multithreading,
1618                                             performance_counting));
1619         gnamem.reset(new gna_memory_type(
1620                 make_polymorph<GNAAllocator>(*gnadevice.get()), PAGE_SIZE_BYTES));
1621     } else {
1622         gnamem.reset(new gna_memory_type(make_polymorph<std::allocator<uint8_t>>()));
1623     }
1624
1625     // keep inputs information and create input primitives
1626     newNet->getInputsInfo(inputsDataMap);
1627     if (inputsDataMap.empty()) {
1628         THROW_GNA_EXCEPTION << "No inputs for the topology";
1629     }
1630
1631     // keep output dims
1632     newNet->getOutputsInfo(outputsDataMap);
1633     if (outputsDataMap.empty()) {
1634         THROW_GNA_EXCEPTION << "No outputs for the topology";
1635     }
1636     if (outputsDataMap.size() != 1) {
1637         THROW_GNA_EXCEPTION << "cannot infer topologies with more than one output";
1638     }
1639     outputDims = outputsDataMap.begin()->second->dims;
1640
1641     for (auto && input : inputsDataMap) {
1642         get_ptr_inputs_global(input.first).resize(gna_lib_async_threads_num);
1643     }
1644
1645     ptr_outputs_global.resize(gna_lib_async_threads_num);
1646     // CreatingLayer primitives
1647     // TODO: solely gna_example convolution hack
1648     num_feature_maps = 1;
1649     for (auto layer = sortedNoMem.begin(); layer != sortedNoMem.end(); ++layer) {
1650         CreateLayerPrimitive(*layer);
1651     }
1652     DnnComponentsForLayer::iterator output_component = std::find_if(dnnComponentsForLayer.begin(),
1653                                                         dnnComponentsForLayer.end(),
1654                                                         [&](const std::pair<std::string, intel_dnn_component_t>& v)
1655                                                         { return outputsDataMap.begin()->first == v.first; });
1656
1657     if (output_component == dnnComponentsForLayer.end()) {
1658         if (dnnComponentsForLayer.empty()) {
1659             THROW_GNA_EXCEPTION << "No outputs found in internal structures";
1660         }
1661         // likely layer is fused. Take last one
1662         output_component = std::prev(dnnComponentsForLayer.end());
1663         gnalog() << "Output layer " << outputsDataMap.begin()->first
1664                     << " has not been found in component list. Took "
1665                     << output_component->first << " instead\n" << std::flush;
1666     }
1667     gnamem->bind_ptr(&ptr_outputs_global.front(), &output_component->second.ptr_outputs);
1668
1669     // make room for active list
1670     gnamem->reserve_ptr(nullptr, ALIGN64(output_component->second.num_bytes_per_output * output_component->second.num_rows_out));
1671
1672     void *pParallelExecutionData  = nullptr;
1673
1674     // reserving more bytes for intermediate data in the parallel case - TODO: this works incorrectly in compact mode at least
1675     rwSegmentSize = gnamem->getRWBytes();
1676     if (gna_lib_async_threads_num > 1) {
1677         gnamem->reserve_ptr(&pParallelExecutionData, gnamem->getRWBytes() * (gna_lib_async_threads_num - 1));
1678     }
1679
1680     gnamem->commit();
1681
1682     dnn.Init(gnamem->getBasePtr(),
1683              gnamem->getTotalBytes(),
1684              networkPrecision.is_float() ? kDnnFloat : kDnnInt,
1685              1);
1686
1687     // TODO: this copy is unneeded; in fact we can directly create the GNA structs from the list
1688     for (auto &element : dnnComponentsForLayer) {
1689         dnn.component.push_back(element.second);
1690     }
1691
1692     // in FP32 mode the last PWL cannot be computed without this
1693     dnn.InitActiveList(NULL);
1694
1695     nnets.push_back(std::make_tuple(make_shared<CPPWrapper<intel_nnet_type_t>>(), -1, InferenceEngine::BlobMap()));
1696
1697     if (!networkPrecision.is_float()) {
1698         // the number of layers gets calculated inside InitGNAStruct
1699         dnn.InitGNAStruct(&std::get<0>(nnets.front())->obj);
1700     }
1701
1702     // creating same gna RW segment for parallel infer requests
1703     for (int i = 1; i != gna_lib_async_threads_num; i++) {
1704         nnets.push_back(std::make_tuple(make_shared<CPPWrapper<intel_nnet_type_t>>(), -1, InferenceEngine::BlobMap()));
1705
1706         // this can be improved by just copying all the structures, but we are too lazy
1707         dnn.InitGNAStruct(&std::get<0>(nnets.back())->obj);
1708
1709         // relocate rw pointers to new offset
1710         auto basePtr = reinterpret_cast<uint8_t*>(pParallelExecutionData) + rwSegmentSize * (i - 1);
1711
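        // rebases a pointer from the primary RW segment onto this request's private segment by
        // preserving its byte offset from gnamem's base pointer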
1712         auto relocate = [basePtr, this](void *& ptr_out, void * ptr_in) {
1713             if (ptr_in == nullptr) {
1714                 ptr_out = nullptr;
1715             } else {
1716                 auto offset = reinterpret_cast<uint8_t *>(ptr_in) - reinterpret_cast<uint8_t *>(gnamem->getBasePtr());
1717                 ptr_out = basePtr + offset;
1718             }
1719         };
1720
1721         for (auto &&input : ptr_inputs_global_storage) {
1722             relocate(input[i], input[0]);
1723         }
1724
1725         relocate(ptr_outputs_global[i], ptr_outputs_global[0]);
1726         for (int j = 0; j != std::get<0>(nnets.front())->obj.nLayers; j++) {
1727             auto & layer = std::get<0>(nnets[i])->obj.pLayers[j];
1728
1729             relocate(layer.pInputs, layer.pInputs);
1730             relocate(layer.pOutputs, layer.pOutputs);
1731             relocate(layer.pOutputsIntermediate, layer.pOutputsIntermediate);
1732         }
1733     }
1734
1735     // calculating input orientation without memory layers, since their orientation is not changed during inference right now
1736     std::unordered_map<string, string> skippedLayers;
1737     for (auto &layer : sortedNet) {
1738         for (int i = 0; CNNNetHasPrevLayer(layer.get(), i); i++) {
1739             auto prevLayer = CNNNetPrevLayer(layer.get(), i);
1740             if (!skippedLayers.count(prevLayer->name)) {
1741                 if (CNNNetHasPrevLayer(prevLayer.get())) {
1742                     continue;
1743                 }
1744
1745                 // we are in the one of input layers
1746                 if (LayerInfo(prevLayer).isMemory()) {
1747                     continue;
1748                 }
1749             }
1750
1751             auto dnnLayer = findDnnLayer(layer);
1752             string inputName = prevLayer->name;
1753             if (skippedLayers.count(prevLayer->name)) {
1754                 inputName = skippedLayers[prevLayer->name];
1755             }
1756
1757             // non functional layer - skipped by gna
1758             if (nullptr == dnnLayer) {
1759                 // storing input name for skipped layer
1760                 skippedLayers[layer->name] = inputName;
1761                 continue;
1762             }
1763
1764             // input orientation might be already initialized, thus verify that it matches
1765             if (!orientation_in.count(inputName)) {
1766                 orientation_in[inputName] = dnnLayer->orientation_in;
1767             } else {
1768                 if (orientation_in[inputName] != dnnLayer->orientation_in) {
1769                     THROW_GNA_EXCEPTION << "orientation for input layer: " << inputName << " cannot be calculated";
1770                 }
1771             }
1772         }
1773     }
1774
1775     orientation_out = output_component->second.orientation_out;
1776     num_bytes_per_output = output_component->second.num_bytes_per_output;
1777
1778     // find output layer
1779     auto output = std::find_if(sortedNet.begin(),
1780                                 sortedNet.end(),
1781                                 [&](const CNNLayerPtr& v)
1782                                 { return outputsDataMap.begin()->first == v.get()->name; });
1783     if (output == sortedNet.end()) {
1784         // likely layer is fused. Take last one
1785         output = std::prev(sortedNet.end());
1786     }
1787     auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(*output);
1788     output_scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f;
1789
1790     num_rotate_rows = dnn.num_rotate_rows;
1791     num_rotate_columns = dnn.num_rotate_columns;
1792
1793     DumpXNNToFile();
1794
1795 #ifdef PLOT
1796     dnn.WriteGraphWizModel("graph.dot");
1797     // ExportGnaNetworkAndrzej("layers/loaded_from_ir", &nnet->obj);
1798 #endif
1799 }
1800 void GNAPlugin::DumpXNNToFile() const {
1801     // TODO: output precision as well as pointer might be incorrect, LSTM for sure
1802     // GNA appears to automatically set layer 0 as the output and adjusts its pointer / precision / size accordingly
1803     if (!dumpXNNPath.empty()) {
1804         if (!gnadevice) {
1805             THROW_GNA_EXCEPTION << "Cannot generate XNNDump for float network";
1806         }
1807         auto dump = gnadevice->dumpXnn(&std::get<0>(nnets.front())->obj, ptr_active_indices, num_active_indices);
1808         dump.header.rw_region_size = gnamem->getRWBytes();
1809         dump.header.input_scaling_factor = get_input_scale_factor();
1810         dump.header.output_scaling_factor = output_scale_factor;
1811         std::ofstream dumpStream(dumpXNNPath, std::ios::out | std::ios::binary);
1812         dumpStream.write(reinterpret_cast<char*>(&dump.header), sizeof(intel_gna_model_header));
1813         dumpStream.write(reinterpret_cast<char*>(dump.model.get()), dump.header.model_size);
1814     }
1815 }
1816
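// RotateFeatures transposes each feature vector in place: element (i, j) of the
// num_rotate_rows x num_rotate_columns view moves to (j, i). The call sites use this to convert
// between interleaved and non-interleaved orientations when the input layout and the network's
// expectations disagree.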
1817 void RotateFeatures(uint8_t *ptr_feat,
1818                     size_t element_size,
1819                     uint32_t num_feature_vectors,
1820                     uint32_t num_feature_vector_elements,
1821                     uint32_t num_rotate_rows,
1822                     uint32_t num_rotate_columns) {
1823     if (num_feature_vector_elements == num_rotate_rows * num_rotate_columns) {
1824         std::vector<uint8_t> temp(num_feature_vector_elements * element_size);
1825         for (uint32_t k = 0; k < num_feature_vectors; k++) {
1826             uint8_t *ptr_in = ptr_feat + k * num_feature_vector_elements * element_size;
1827             for (uint32_t i = 0; i < num_rotate_rows; i++) {
1828                 for (uint32_t j = 0; j < num_rotate_columns; j++) {
1829                     ie_memcpy(&temp.front() + (j * num_rotate_rows + i)*element_size,
1830                               temp.size() - (i * num_rotate_columns + j)*element_size,
1831                               ptr_in + (i * num_rotate_columns + j)*element_size,
1832                               element_size);
1833                 }
1834             }
1835             memcpy(ptr_in, &temp.front(), num_feature_vector_elements * element_size);
1836         }
1837     } else {
1838         THROW_GNA_EXCEPTION << "Rotate dimensions (" << num_rotate_rows << "," << num_rotate_columns
1839                             << ") do not match buffer length of " << num_feature_vector_elements << " in RotateFeatures()!";
1840     }
1841 }
1842
1843 uint32_t GNAPlugin::QueueInference(const InferenceEngine::BlobMap &inputs, InferenceEngine::BlobMap &result) {
1844     auto freeNnet = std::find_if(std::begin(nnets), std::end(nnets), [](decltype(nnets.front()) & item) {
1845         return std::get<1>(item) == -1;
1846     });
1847
1848     if (freeNnet == nnets.end()) {
1849         if (memory_connection.size() != 0) {
1850             Wait(0);
1851             freeNnet = nnets.begin();
1852         } else {
1853             THROW_IE_EXCEPTION << as_status << REQUEST_BUSY
1854                                << "GNA executable network has max of "
1855                                << static_cast<uint32_t >(gna_lib_async_threads_num)
1856                                << " parallel infer requests, please sync one of already running";
1857         }
1858     }
1859
1860
1861     auto nnet = std::get<0>(*freeNnet).get();
1862     auto idx = static_cast<uint32_t>(std::distance(std::begin(nnets), freeNnet));
1863
1864     for (auto &input : inputs) {
1865         auto inputLayout = input.second->layout();
1866         if (inputLayout != Layout::NC && inputLayout != Layout::CN && inputLayout != NCHW) {
1867             THROW_GNA_EXCEPTION << "Expected input blob to have Layout::NC, Layout::CN or Layout::NCHW, but was: "
1868                                 << input.second->layout();
1869         }
1870         if (inputLayout == NCHW) {
1871             inputLayout = NC;
1872         }
1873         auto is2D = input.second->layout() == Layout::NC || input.second->layout() == Layout::CN;
1874
1875         if (!ptr_inputs_global_id.count(input.first)) {
1876             // should not happen in user code; however it might happen if there is any non-executable-network-based integration of a GNAPlugin instance
1877             THROW_GNA_EXCEPTION << "network not loaded : input pointer for " << input.first << " not set";
1878         }
1879
1880         if (get_ptr_inputs_global(input.first)[idx] == nullptr) {
1881             // should not happen in user code; however it might happen if there is any non-executable-network-based integration of a GNAPlugin instance
1882             THROW_GNA_EXCEPTION << "network not loaded : input pointer for (" << input.first << " at inferRequest #"
1883                                 << idx << " not set";
1884         }
1885
1886         if (orientation_in[input.first] == kDnnUnknownOrientation) {
1887             // should not happen in user code; however it might happen if there is any non-executable-network-based integration of a GNAPlugin instance
1888             THROW_GNA_EXCEPTION << "network not loaded : input orientation for " << input.first << " not set";
1889         }
1890
1891         if (orientation_out == kDnnUnknownOrientation) {
1892             // should not happen in user code; however it might happen if there is any non-executable-network-based integration of a GNAPlugin instance
1893             THROW_GNA_EXCEPTION << "network not loaded : output orientation not set";
1894         }
1895
1896         auto dims = input.second->dims();
1897
1898         ImportFrames(get_ptr_inputs_global(input.first)[idx],
1899                      input.second->cbuffer().as<float *>(),
1900                      input.second->precision(),
1901                      orientation_in[input.first],
1902                      dims[dims.size() - 1],
1903                      is2D ? dims[1] : dims[dims.size() - 1],
1904                      is2D ? dims[0] : dims[0] * dims[1] * dims[2],
1905                      is2D ? dims[0] : dims[0] * dims[1] * dims[2]);
1906         bool isOneChannel = input.second->getTensorDesc().getDims()[1] == 1;
1907         if (((inputLayout == Layout::NC || inputLayout == Layout::NCHW)
1908             != (orientation_in[input.first] == kDnnInterleavedOrientation))
1909             && !isOneChannel) {
1910             RotateFeatures(reinterpret_cast<uint8_t *>(get_ptr_inputs_global(input.first)[idx]),
1911                            gnadevice ? 2 : 4,
1912                            // TODO: only works for cnn4a and google command so far
1913                            dims[dims.size() - 1],
1914                            is2D ? dims[0] : dims[0] * dims[2],  // num_feature_vectors - it looks like the batch size should go here
1915                            num_rotate_rows,
1916                            num_rotate_columns);
1917         }
1918     }
1919
1920     if (!gnadevice) {
1921         dnn.Propagate();
1922         std::get<1>(*freeNnet) = 1;
1923     } else {
1924         std::get<1>(*freeNnet) = gnadevice->propagate(&nnet->obj, ptr_active_indices, num_active_indices);
1925     }
1926     std::get<2>(*freeNnet) = result;
1927     return idx;
1928 }
1929
1930 void GNAPlugin::Wait(uint32_t idx) {
1931     // already synced. TODO: might a copy be required?
1932     if (std::get<1>(nnets[idx]) == -1) return;
1933
1934     if (gnadevice) {
1935         gnadevice->wait(std::get<1>(nnets[idx]));
1936     }
1937
1938     std::get<1>(nnets[idx]) = -1;
1939     auto & result = std::get<2>(nnets[idx]);
1940 #ifdef PLOT
1941     dnn.BeginNewWrite();
1942     if (dnn.num_components() != 0) {
1943         dnn.WriteDnnText("Net_.txt", kDnnFloat);
1944         dnn.WriteInputAndOutputText();
1945     }
1946     dnn.WriteInputAndOutputTextGNA(&std::get<0>(nnets.front())->obj);
1947 #endif
1948     if (result.size() != 1) {
1949         THROW_GNA_EXCEPTION << "Invalid number of outputs for infer request: " << result.size() << ", only 1 supported";
1950     }
1951     auto & output = *result.begin()->second;
1952
1953     if (output.layout() == Layout::NC) {
1954         // TODO: rotate can be incorporated with exporting - used only in unit tests so far
1955         // TODO: restore:
1956 //        if (orientation_out != kDnnInterleavedOrientation) {
1957 //            if (inputs.size() != 1) {
1958 //                THROW_GNA_EXCEPTION << "Invalid number of inputs for  for deinterleave " << inputs.size()
1959 //                                    << ", only 1 supported";
1960 //            }
1961 //            auto dims = inputs.begin()->second->dims();
1962 //            RotateFeatures(reinterpret_cast<uint8_t*>(ptr_outputs_global),
1963 //                           gnadevice ? 2 : 4,
1964 //                           dims[dims.size() - 1],
1965 //                           dims[0],  // num_feature_vectors looks batch should be there
1966 //                           dims[0],
1967 //                           dims[dims.size() - 1]);
1968 //        }
1969         // we consider the last layer as the output ...
1970         size_t output_layer_index = std::max(0, static_cast<int>(std::get<0>(nnets[idx])->obj.nLayers - 1));
1971         if (gnadevice && std::get<0>(nnets[idx])->obj.pLayers[output_layer_index].pOutputs != ptr_outputs_global[idx]) {
1972             // ...as this is not true, we should look for output layer index
1973             for (int j = 0; j != std::get<0>(nnets[idx])->obj.nLayers; j++) {
1974                 if (std::get<0>(nnets[idx])->obj.pLayers[j].pOutputs == ptr_outputs_global[idx]) {
1975                     output_layer_index = j;
1976                     break;
1977                 }
1978             }
1979         }
1980
1981         ExportScores(output.buffer(),
1982                      ptr_outputs_global[idx],
1983                      orientation_out,
1984                      output.dims()[output.dims().size() - 1],
1985                      output.dims()[1],
1986                      output.dims()[0],
1987                      output.dims()[0],
1988                      output.dims()[0],
1989                      // TODO: create better getter consider multiple outputs case
1990                      gnadevice ? std::get<0>(nnets[idx])->obj.pLayers[output_layer_index].nBytesPerOutput : sizeof(float),
1991                      sizeof(float));
1992     } else if (output.layout() != Layout::CN) {
1993         THROW_GNA_EXCEPTION << "Expected output blob to have Layout::NC or Layout::CN. But was " << output.layout();
1994     }
1995
1996     if (gnadevice) {
1997 #ifdef PLOT
1998         FILE *f = nullptr;
1999         static int num_infers = 0;
2000         {
2001             f = fopen("ex_scores.txt", "w");
2002         }
2003         num_infers++;
2004         if (f) {
2005             for (int i = 0; i < output.dims()[1]; i++) {
2006                 for (int j = 0; j < output.dims()[0]; j++) {
2007                     fprintf(f, "%d ", output.cbuffer().as<int32_t *>()[output.dims()[0] * i + j]);
2008                 }
2009                 fprintf(f, "\n");
2010             }
2011             fprintf(f, "\n\n");
2012         }
2013 #endif
2014         ConvertToFloat(output.buffer(),
2015                        output.buffer(),
2016                        output.dims()[0],
2017                        output.dims()[1],
2018                        output_scale_factor);
2019 #ifdef PLOT
2020         if (f) {
2021             for (int i = 0; i < output.dims()[1]; i++) {
2022                 for (int j = 0; j < output.dims()[0]; j++) {
2023                     fprintf(f, "%.2f ", output.cbuffer().as<float *>()[output.dims()[0] * i + j]);
2024                 }
2025                 fprintf(f, "\n");
2026             }
2027             fclose(f);
2028         }
2029 #endif
2030     }
2031 }
2032
2033 void GNAPlugin::Reset() {
2034     for (auto && memLayer : memory_connection) {
2035         std::memset(memLayer.second.gna_ptr, 0, memLayer.second.reserved_size);
2036     }
2037     for (auto && concatLayer : concat_connection) {
2038         std::memset(concatLayer.second.gna_ptr, 0, concatLayer.second.reserved_size);
2039     }
2040 }
2041
2042 void GNAPlugin::Infer(const InferenceEngine::Blob &input, InferenceEngine::Blob &output) {
2043     BlobMap bmInput;
2044     BlobMap bmOutput;
2045     if (inputsDataMap.size() != 1) {
2046         THROW_GNA_EXCEPTION << "cannot infer using Infer(Blob&, Blob&): model accepts " << inputsDataMap.size() << " inputs";
2047     }
2048     if (outputsDataMap.size() != 1) {
2049         THROW_GNA_EXCEPTION << "cannot infer using Infer(Blob&, Blob&): model accepts " << outputsDataMap.size() << " outputs";
2050     }
2051
2052     bmInput[inputsDataMap.begin()->first] = std::shared_ptr<Blob>(const_cast<Blob*>(&input), [](Blob*){});
2053     bmOutput[outputsDataMap.begin()->first] = std::shared_ptr<Blob>(&output, [](Blob*){});
2054     Infer(bmInput, bmOutput);
2055 }
2056
2057 void GNAPlugin::Infer(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result) {
2058     Wait(QueueInference(input, result));
2059 }
2060
2061 Blob::Ptr GNAPlugin::GetOutputBlob(InferenceEngine::Precision precision) {
2062     // need to have intermediate blob for interleave conversion
2063     InferenceEngine::Blob::Ptr outputBlob;
2064     outputBlob = make_blob_with_precision(precision, NC, outputDims);
2065     outputBlob->allocate();
2066     return outputBlob;
2067 }
2068
2069 Blob::Ptr GNAPlugin::GetInputBlob(std::string name, InferenceEngine::Precision precision) {
2070     InferenceEngine::Blob::Ptr inputBlob;
2071     // need to have intermediate blob for interleave conversion
2072     // TODO: NCHW format support is experimental: the C++ MO inserts a reshape, while the TF MO does not
2073     auto inputDims = inputsDataMap[name]->getDims();
2074     inputBlob = make_blob_with_precision(precision, inputDims.size() == 2 ? NC : NCHW, inputDims);
2075     inputBlob->allocate();
2076     return inputBlob;
2077 }
2078
2079 std::vector<InferenceEngine::MemoryStateInternal::Ptr>  GNAPlugin::QueryState() {
2080     if (memory_connection.empty()) {
2081         return {};
2082     }
2083
2084     return {std::make_shared<GNAMemoryState>(shared_from_this())};
2085 }
2086
2087 InferenceEngine::IExecutableNetwork::Ptr GNAPlugin::ImportNetwork(const std::string &modelFileName) {
2088     // no need to return anything due to the weird design of the internal base classes
2089     std::fstream inputStream(modelFileName, ios_base::in | ios_base::binary);
2090     if (inputStream.fail()) {
2091         THROW_GNA_EXCEPTION << "Cannot open file to import model: " << modelFileName;
2092     }
2093
2094     auto header = GNAModelSerial::ReadHeader(inputStream);
2095
2096     gnadevice.reset(new GNADeviceHelper(gna_proc_type,
2097                                         gna_lib_async_threads_num,
2098                                         gna_openmp_multithreading));
2099     gnamem.reset(new gna_memory_type(make_polymorph<GNAAllocator>(*gnadevice.get()), PAGE_SIZE_BYTES));
2100
2101     void *basePtr = nullptr;
2102     gnamem->reserve_ptr(&basePtr, header.gnaMemSize);
2103     gnamem->commit();
2104
2105     nnets.push_back(std::make_tuple(make_shared<CPPWrapper<intel_nnet_type_t>>(header.layersCount), -1, InferenceEngine::BlobMap()));
2106     std::get<0>(nnets.back())->obj.nGroup = header.nGroup;
2107     GNAModelSerial::MemoryType  mt;
2108     auto serial = GNAModelSerial(&std::get<0>(nnets.back())->obj, mt);
2109     serial.Import(basePtr, header.gnaMemSize, inputStream);
2110
2111
2112     get_ptr_inputs_global("input").push_back(reinterpret_cast<float*>(reinterpret_cast<uint8_t *> (basePtr) + header.input.descriptor_offset));
2113     ptr_outputs_global.push_back(reinterpret_cast<float*>(reinterpret_cast<uint8_t *> (basePtr) + header.output.descriptor_offset));
2114
2115     auto getOrientation = [](intel_nnet_layer_t & layer) {
2116         return layer.nLayerKind == INTEL_CONVOLUTIONAL ?
2117            kDnnNonInterleavedOrientation : kDnnInterleavedOrientation;
2118     };
2119
2120     orientation_in["input"] = getOrientation(std::get<0>(nnets.back())->obj.pLayers[0]);
2121     orientation_out = getOrientation(std::get<0>(nnets.back())->obj.pLayers[std::get<0>(nnets.back())->obj.nLayers-1]);
2122
2123     num_bytes_per_output = header.output.element_size;
2124
2125
2126     outputDims = SizeVector({header.output.elements_count / header.nGroup, header.nGroup});
2127     auto inputDims = SizeVector({header.input.elements_count / header.nGroup, header.nGroup});
2128
2129     inputsDataMap["input"] = std::make_shared<InputInfo>();
2130     inputsDataMap["input"]->setInputData(make_shared<Data>("input",
2131                                                            inputDims,
2132                                                            Precision::FP32,
2133                                                            Layout::NC));
2134     outputsDataMap["output"] = make_shared<Data>("output",
2135                                                  outputDims,
2136                                                  Precision::FP32,
2137                                                  Layout::NC);
2138
2139     output_scale_factor = header.output.scaleFactor;
2140     input_scale_factor["input"] = header.input.scaleFactor;
2141
2142     num_rotate_rows = header.nRotateRows;
2143     num_rotate_columns = header.nRotateColumns;
2144
2145     for (auto && memory : mt) {
2146         GNAMemoryLayer memoryLayer(nullptr, nullptr);
2147         memoryLayer.gna_ptr = memory.first;
2148         memoryLayer.reserved_size = memory.second;
2149
2150         memory_connection.emplace_back(make_pair(std::string("noname"), memoryLayer));
2151     }
2152
2153     DumpXNNToFile();
2154
2155 #ifdef PLOT
2156     dnn.WriteGraphWizModel("graph.dot");
2157     // ExportGnaNetworkAndrzej("layers/loaded_from_aot_file", &nnet->obj);
2158 #endif
2159
2160     return nullptr;
2161 }
2162
2163 void GNAPlugin::Export(const std::string &fileName) {
2164     if (ptr_inputs_global_id.empty() || ptr_outputs_global.empty()) {
2165         THROW_GNA_EXCEPTION << "network not loaded";
2166     }
2167
2168     if (ptr_inputs_global_id.size() != 1) {
2169         THROW_GNA_EXCEPTION << "exporting network with multiple inputs not supported";
2170     }
2171
2172     std::fstream outStream(fileName, ios_base::out | ios_base::binary);
2173
2174     // TODO: the nnet group parameter looks to be used only by the application - so can we move this line into LoadNetwork?
2175     auto inputDims = inputsDataMap.begin()->second->getDims();
2176     if (inputDims.size() == 2) {
2177         std::get<0>(nnets.front())->obj.nGroup = inputDims[1];
2178     }
2179
2180     auto serial = GNAModelSerial(&std::get<0>(nnets.front())->obj,
2181                    {get_input_scale_factor(),
2182                     ptr_inputs_global_storage.front()[0],
2183                     2,
2184                     static_cast<uint32_t>(InferenceEngine::details::product(inputsDataMap.begin()->second->getDims()))},
2185                    {output_scale_factor,
2186                     ptr_outputs_global[0],
2187                     num_bytes_per_output,
2188                     static_cast<uint32_t>(InferenceEngine::details::product(outputsDataMap.begin()->second->getDims()))})
2189         .SetInputRotation(dnn.num_rotate_rows, dnn.num_rotate_columns);
2190
2191     for (auto && memoryConnection : memory_connection) {
2192         serial.AddState(memoryConnection.second.gna_ptr, memoryConnection.second.reserved_size);
2193     }
2194
2195     serial.Export(gnamem->getBasePtr(), gnamem->getTotalBytes(), outStream);
2196 }
2197
2198 void GNAPlugin::GetPerformanceCounts(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfMap) {
2199     if (performance_counting) {
2200         gnadevice->getGnaPerfCounters(perfMap);
2201     }
2202 }
2203
2204 void GNAPlugin::AddExtension(InferenceEngine::IExtensionPtr extension) {}
2205
2206 void GNAPlugin::SetConfig(const std::map<std::string, std::string> &config) {
2207     std::vector<std::string> supportedConfigOptions = {
2208         GNA_CONFIG_KEY(SCALE_FACTOR),
2209         GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE),
2210         GNA_CONFIG_KEY(DEVICE_MODE),
2211         GNA_CONFIG_KEY(COMPACT_MODE),
2212         CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS),
2213         GNA_CONFIG_KEY(PRECISION),
2214         GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN),
2215         CONFIG_KEY(PERF_COUNT),
2216         GNA_CONFIG_KEY(LIB_N_THREADS),
2217         CONFIG_KEY(SINGLE_THREAD)
2218     };
2219
2220     for (auto& item : config) {
2221         auto keys = std::find_if(supportedConfigOptions.begin(), supportedConfigOptions.end(), [&item](std::string supportedConfigOption) {
2222             return item.first.find(supportedConfigOption) != std::string::npos;
2223         });
2224         if (keys == supportedConfigOptions.end()) {
2225             THROW_GNA_EXCEPTION << as_status << NOT_FOUND << "Incorrect GNA Plugin config. Key " << item.first << " not supported";
2226         }
2227     }
2228
2229     // holds actual value of a found key
2230     std::string key;
2231     std::string value;
2232     auto if_set = [&](std::string keyInput, const std::function<void()> & handler) {
2233         auto keyInMap = config.find(keyInput);
2234         if (keyInMap != config.end()) {
2235             value = keyInMap->second;
2236             handler();
2237         }
2238     };
2239
2240     auto if_start = [&](std::string keyInput, const std::function<void()> & handler) {
2241         for (auto && c : config) {
2242             if (c.first.find(keyInput) == 0) {
2243                 if (c.first.size() > keyInput.size() + 1) {
2244                     key = c.first.substr(keyInput.size() + 1);
2245                     value = c.second;
2246                     handler();
2247                 }
2248             }
2249         }
2250     };
2251
2252     auto fp32eq = [](float p1, float p2) -> bool {
2253         return (std::abs(p1 - p2) <= 0.00001f * std::min(std::abs(p1), std::abs(p2)));
2254     };
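    // note: a relative-tolerance comparison; if either operand is exactly zero the right-hand side
    // collapses to zero, so in that case only an exact 0 == 0 pair compares equal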
2255
2256     auto & log = gnalog();
2257
2258     if_start(GNA_CONFIG_KEY(SCALE_FACTOR), [&, this] {
2259         // only identical scale factors supported so far
2260         auto ref = input_scale_factor.size() ? input_scale_factor.begin()->second : 1.0;
2261         input_scale_factor[key] = std::stod(value);
2262         if (ref != 1.0 && !fp32eq(input_scale_factor[key], ref)) {
2263             log << "only identical input scale factors supported, but provided: " << ref << " and " << input_scale_factor[key];
2264             THROW_GNA_EXCEPTION << "only identical input scale factors supported, but provided: " << ref << " and " << input_scale_factor[key];
2265         }
2268     });
2269
2270     if (input_scale_factor.empty()) {
2271         if_set(GNA_CONFIG_KEY(SCALE_FACTOR), [&] {
2272             input_scale_factor["placeHolder"] = std::stod(value);
2273         });
2274     }
2275
2276     if_set(GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE), [&] {
2277         dumpXNNPath = value;
2278     });
2279
2280     if_set(GNA_CONFIG_KEY(DEVICE_MODE), [&] {
2281         static caseless_unordered_map <std::string, uint32_t> supported_values = {
2282                 {GNAConfigParams::GNA_AUTO, GNA_AUTO},
2283                 {GNAConfigParams::GNA_HW, GNA_HARDWARE},
2284                 {GNAConfigParams::GNA_SW, GNA_SOFTWARE},
2285                 {GNAConfigParams::GNA_SW_EXACT, GNA_SOFTWARE & GNA_HARDWARE}
2286         };
2287         auto procType = supported_values.find(value);
2288         if (procType == supported_values.end()) {
2289             log << "GNA device mode unsupported: " << value;
2290             THROW_GNA_EXCEPTION << "GNA device mode unsupported: " << value;
2291         }
2292         gna_proc_type = static_cast<intel_gna_proc_t>(procType->second);
2293     });
2294
2295     if_set(GNA_CONFIG_KEY(COMPACT_MODE), [&] {
2296         if (value == PluginConfigParams::YES) {
2297             compact_mode = true;
2298         } else if (value == PluginConfigParams::NO) {
2299             compact_mode = false;
2300         } else {
2301             log << "GNA compact mode should be YES/NO, but not " << value;
2302             THROW_GNA_EXCEPTION << "GNA compact mode should be YES/NO, but not " << value;
2303         }
2304     });
2305
2306     if_set(CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS), [&] {
2307         if (value == PluginConfigParams::YES) {
2308             exclusive_async_requests  = true;
2309         } else if (value == PluginConfigParams::NO) {
2310             exclusive_async_requests  = false;
2311         } else {
2312             log << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not " << value;
2313             THROW_GNA_EXCEPTION << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not " << value;
2314         }
2315     });
2316
2317     if_set(GNA_CONFIG_KEY(PRECISION), [&] {
2318         auto precision = Precision::FromStr(value);
2319         if (precision != Precision::I8 && precision != Precision::I16) {
2320             log << "Unsupported precision of GNA hardware, should be Int16 or Int8, but was: " << value;
2321             THROW_GNA_EXCEPTION << "Unsupported precision of GNA hardware, should be Int16 or Int8, but was: " << value;
2322         }
2323         gnaPrecision = precision;
2324     });
2325
2326     if_set(GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN), [&] {
2327         if (value == PluginConfigParams::YES) {
2328             uniformPwlDesign = true;
2329         } else if (value == PluginConfigParams::NO) {
2330             uniformPwlDesign = false;
2331         } else {
2332             log << "GNA pwl uniform algorithm parameter "
2333                 << "should be equal to YES/NO, but not " << value;
2334             THROW_GNA_EXCEPTION << "GNA pwl uniform algorithm parameter "
2335                                 << "should be equal to YES/NO, but not " << value;
2336         }
2337     });
2338
2339     if_set(CONFIG_KEY(PERF_COUNT), [&] {
2340         if (value == PluginConfigParams::YES) {
2341             performance_counting = true;
2342         } else if (value == PluginConfigParams::NO) {
2343             performance_counting = false;
2344         } else {
2345             log << "GNA performance counter enabling parameter "
2346                 << "should be equal to YES/NO, but not " << value;
2347             THROW_GNA_EXCEPTION << "GNA performance counter enabling parameter "
2348                                 << "should be equal to YES/NO, but not " << value;
2349         }
2350     });
2351
2352     if_set(GNA_CONFIG_KEY(LIB_N_THREADS), [&] {
2353         uint64_t lib_threads = std::stoul(value, NULL, 10);
2354         if (lib_threads == 0 || lib_threads > std::numeric_limits<uint8_t>::max()/2-1) {
2355             log << "Unsupported accelerator lib number of threads: " << value << ", should be greater than 0 and less than 127";
2356             THROW_GNA_EXCEPTION << "Unsupported accelerator lib number of threads: " << value
2357                                 << ", should be greater than 0 and less than 127";
2358         }
2359         gna_lib_async_threads_num = lib_threads;
2360     });
2361
    if_set(CONFIG_KEY(SINGLE_THREAD), [&] {
        if (value == PluginConfigParams::YES) {
            gna_openmp_multithreading = false;
        } else if (value == PluginConfigParams::NO) {
            gna_openmp_multithreading = true;
        } else {
            log << "SINGLE_THREAD should be YES/NO, but was: " << value;
            THROW_GNA_EXCEPTION << "SINGLE_THREAD should be YES/NO, but was: " << value;
        }
    });
}

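/*
 * Illustrative sketch only (the entry point that feeds the if_set handlers
 * above, e.g. SetConfig, is outside this fragment): the handlers consume
 * entries of a plain string-to-string plugin config map, such as
 *
 *     std::map<std::string, std::string> config = {
 *         {GNA_CONFIG_KEY(DEVICE_MODE), GNAConfigParams::GNA_SW_EXACT},
 *         {CONFIG_KEY(PERF_COUNT), PluginConfigParams::YES},
 *         {GNA_CONFIG_KEY(LIB_N_THREADS), "4"},
 *     };
 *
 * Unrecognized values make the corresponding handler throw, as seen above.
 */
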
/**
 * @deprecated Use the version with the config parameter
 */
void GNAPlugin::QueryNetwork(const InferenceEngine::ICNNNetwork& network,
                             InferenceEngine::QueryNetworkResult& res) const {
    QueryNetwork(network, {}, res);
}

void GNAPlugin::QueryNetwork(const InferenceEngine::ICNNNetwork& network,
                             const std::map<std::string, std::string>& config,
                             InferenceEngine::QueryNetworkResult& res) const {
    std::unordered_set<CNNLayer *> allLayers;
    InferenceEngine::InputsDataMap inputs;

    network.getInputsInfo(inputs);
    std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(network);

    if (inputs.empty()) {
        THROW_GNA_EXCEPTION << "Network is empty (GNA)\n";
    }

    auto const & secondLayers = inputs.begin()->second->getInputData()->getInputTo();
    if (secondLayers.empty()) {
        THROW_GNA_EXCEPTION << "Network consists of input layer only (GNA)\n";
    }

    InferenceEngine::details::UnorderedDFS(allLayers,
                                           secondLayers.begin()->second,
                                           [&](CNNLayerPtr const layer) {
                                               if (GNAPluginNS::GNAPlugin::LayerTypeFromStr(layer->type) != NO_TYPE) {
                                                   res.supportedLayers.insert(layer->name);
                                               }
                                           }, false);
}

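/*
 * Usage sketch (hedged; the calling convention mirrors the IE plugin API as
 * used elsewhere in this file):
 *
 *     InferenceEngine::QueryNetworkResult res;
 *     plugin.QueryNetwork(network, {}, res);
 *     // res.supportedLayers now holds the names of all layers whose type maps
 *     // to something other than NO_TYPE in LayerTypeFromStr.
 */
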
intel_dnn_component_t * GNAPlugin::find_first_unused_input(InferenceEngine::CNNLayerPtr current) {
    if (current->insData.empty()) return nullptr;

    auto prev_layer = current->insData.front().lock()->creatorLayer.lock();

    return findDnnLayer(prev_layer);
}
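
/*
 * connectOutput binds the output buffer of `layer` to its consumer:
 *  - if the next layer is a registered memory layer, the output is bound into
 *    that layer's reserved region (reserving it first if needed);
 *  - if the next layer is a concat, the output is bound at this input's offset
 *    inside the concat's shared buffer, which is reserved once for the whole
 *    concat unless that concat is itself an input to another concat;
 *  - otherwise, in compact mode the output may reuse the buffer of the first
 *    unused input found by find_first_unused_input; failing that, a fresh
 *    region is reserved.
 * All sizes are rounded with ALIGN64; assuming the usual ALIGN semantics
 * (round up to the next multiple of 64), e.g. ALIGN64(100) == 128.
 */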
void GNAPlugin::connectOutput(InferenceEngine::CNNLayerPtr layer, void *ptr, void *ptr_inputs, size_t num_data_bytes_out) {
    gnalog() << "Connecting output " << layer->name << " ...\n";
    // in the case of a memory layer, its input is allocated by the memory-input layer
    if (layer->outData.size() == 1) {
        for (auto &&outLayer : layer->outData.front()->getInputTo()) {
            auto& nextLayer = outLayer.second;
            auto nextMemoryLayerIt =
                std::find_if(begin(memory_connection), end(memory_connection),
                             [&](MemoryConnection::value_type &comp) {
                                 return comp.second.getOutput()->name == nextLayer->name;
                             });
            if (nextMemoryLayerIt != memory_connection.end()) {
                auto &nextMemoryLayer = nextMemoryLayerIt->second;
                // memory layer not yet initialized
                if (nextMemoryLayer.reserved_size == 0) {
                    gnamem->reserve_ptr(&nextMemoryLayer.gna_ptr, ALIGN64(num_data_bytes_out));
                    gnamem->bind_ptr(ptr, &nextMemoryLayer.gna_ptr, 0);

                    nextMemoryLayer.reserved_offset = 0;
                    nextMemoryLayer.reserved_size = ALIGN64(num_data_bytes_out);
                } else {
                    IE_ASSERT(nextMemoryLayer.reserved_size == ALIGN64(num_data_bytes_out));
                    // same offset as when the region was reserved
                    gnamem->bind_ptr(ptr, &nextMemoryLayer.gna_ptr, 0);
                }
                return;
            }
        }

        // if one of the next layers is a concat...
        for (auto &&outLayer : layer->outData.front()->getInputTo()) {
            auto nextLayer = outLayer.second;
            if (LayerInfo(nextLayer).isConcat()) {
                auto& name = layer->name;
                // look this concat layer up in the extra concat map
                auto concatLayerInfo = concat_connection.find(nextLayer->name);

                if (concatLayerInfo != concat_connection.end()) {
                    auto &concatLayerInfoItem = concatLayerInfo->second;

                    // find this layer's entry among the concat's input layers
                    auto it = std::find_if(concatLayerInfoItem.concatInputLayers.begin(),
                                           concatLayerInfoItem.concatInputLayers.end(),
                                           [&name](GNAPlugin::GNAConcatLayer::ConcatConnectedLayerInfo &item) {
                                               return item.name == name;
                                           });
                    IE_ASSERT(it != concatLayerInfoItem.concatInputLayers.end());
                    // reserve the full size for the concat once
                    if (!concatLayerInfoItem.output_allocation_flag) {
                        // check whether this concat is itself an input to another concat
                        // by going through each concat and checking its inputs
                        auto included =
                            std::find_if(concat_connection.begin(),
                                         concat_connection.end(),
                                         [&concatLayerInfo]
                                             (const std::pair<std::string, GNAPlugin::GNAConcatLayer> &concatItem) -> bool {
                                                 auto it = std::find_if(concatItem.second.concatInputLayers.begin(),
                                                                        concatItem.second.concatInputLayers.end(),
                                                                        [&concatLayerInfo]
                                                                            (const GNAPlugin::GNAConcatLayer::ConcatConnectedLayerInfo &item) -> bool {
                                                                                return item.name == concatLayerInfo->first;
                                                                            });
                                                 return it != concatItem.second.concatInputLayers.end();
                                             });
                        if (included == concat_connection.end()) {
                            gnamem->reserve_ptr(&concatLayerInfoItem.gna_ptr, ALIGN64(concatLayerInfoItem.reserved_size));

                            for (auto && inputLayer : concatLayerInfoItem.concatInputLayers) {
                                if (InferenceEngine::details::CaselessEq<std::string>()(inputLayer.name, "input")) {
                                    bytes_alllocated_for_input[inputLayer.name] = ALIGN64(concatLayerInfoItem.reserved_size) - inputLayer.offset;
                                }
                            }
                        }
                        concatLayerInfo->second.output_allocation_flag = true;
                    }
                    gnamem->bind_ptr(ptr, &concatLayerInfoItem.gna_ptr, it->offset);
                } else {
                    // this concat is not registered in concat_connection; nothing to bind
                }
                return;
            }
        }
    }

    intel_dnn_component_t * unused_input = nullptr;
    if (compact_mode) {
        unused_input = find_first_unused_input(layer);
        if (unused_input != nullptr) {
            gnamem->bind_ptr(ptr, &unused_input->ptr_inputs, 0, ALIGN64(num_data_bytes_out));
        }
    }
    // no suitable input to reuse - reserve a fresh region
    if (unused_input == nullptr) {
        gnamem->reserve_ptr(ptr, ALIGN64(num_data_bytes_out));
    }
}

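/*
 * Looks up the intel_dnn_component_t created for a CNN layer by name, via a
 * linear scan of dnnComponentsForLayer; returns nullptr when no component was
 * registered for that layer (callers treat this as "no generic previous layer").
 */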
intel_dnn_component_t * GNAPlugin::findDnnLayer(CNNLayerPtr layer) {
    auto component = std::find_if(begin(dnnComponentsForLayer),
                                  end(dnnComponentsForLayer),
                                  [&](DnnComponentsForLayer::value_type &comp) {
                                      return comp.first == layer->name;
                                  });
    // check for a generic previous layer
    if (component != dnnComponentsForLayer.end()) {
        return &component->second;
    }

    return nullptr;
}

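/*
 * Returns the per-input vector of GNA pointers for a network input, creating it
 * on first use. Design note (assuming ptr_inputs_global_storage is a std::list
 * or similar node-based container, as push_front/begin() suggest): node-based
 * containers keep iterators and element addresses valid while other elements
 * are added, so the iterators cached in ptr_inputs_global_id stay usable as
 * further inputs are registered.
 */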
std::vector<void *>& GNAPlugin::get_ptr_inputs_global(std::string name) {
    if (!ptr_inputs_global_id.count(name)) {
        ptr_inputs_global_storage.push_front({});
        ptr_inputs_global_id[name] = ptr_inputs_global_storage.begin();
    }
    return *ptr_inputs_global_id[name];
}

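/*
 * connectInput binds the input buffer `ptr` of `layer` to the output of its
 * idx-th previous layer, walking back through split/slice, concat, crop,
 * reshape and permute layers until a real producer (network input, generic DNN
 * component, or memory layer) is found; it throws if none is found.
 *
 * Offset sign convention (as the network-input branch below shows):
 *
 *     // offset >= 0: place `ptr` at +offset inside the input buffer
 *     gnamem->bind_ptr(ptr, &input_buffer, offset);
 *     // offset < 0: place the input buffer at -offset inside `ptr`
 *     gnamem->bind_ptr(&input_buffer, ptr, -offset);
 *
 * where input_buffer stands for get_ptr_inputs_global(prevLayer->name).front().
 */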
GNAPlugin::ConnectionDetails GNAPlugin::connectInput(CNNLayerPtr layer, void *ptr, size_t num_data_bytes_in, int32_t offset, int idx) {
    // select the particular previous layer feeding this input
    auto prevLayer = CNNNetPrevLayer(layer, idx);

    gnalog() << "Connecting input " << layer->name << " to " << prevLayer->name << " ...\n";

    // a real network input, not a memory input
    if (LayerInfo(prevLayer).isInput()) {
        if (0 == bytes_alllocated_for_input[prevLayer->name]) {
            gnamem->push_value(&get_ptr_inputs_global(prevLayer->name).front(), static_cast<uint8_t>(0), num_data_bytes_in, 64);
            bytes_alllocated_for_input[prevLayer->name] = num_data_bytes_in;
        }
        if (ALIGN(num_data_bytes_in, 64) > ALIGN(bytes_alllocated_for_input[prevLayer->name], 64)) {
            THROW_GNA_EXCEPTION
                << "Layer: " << layer->name
                << " Cannot bind pointer to already allocated input (" << prevLayer->name
                << "), due to size_allocated=" << bytes_alllocated_for_input[prevLayer->name]
                << ", and size_requested=" << num_data_bytes_in;
        }

        if (offset >= 0) {
            gnamem->bind_ptr(ptr, &get_ptr_inputs_global(prevLayer->name).front(), offset);
        } else {
            gnamem->bind_ptr(&get_ptr_inputs_global(prevLayer->name).front(), ptr, -offset);
        }

        return prevLayer;
    }

    LayerInfo layerInfoObj(prevLayer);
    LayerInfo thisLayerInfoObj(layer);
    // connecting to a split/slice layer
    if (layerInfoObj.isSplit() || layerInfoObj.isSlice()) {
        auto& splittingLayer = prevLayer;
        auto& splitName = splittingLayer->name;
        auto& name = layer->name;

        // look this split layer up in the extra split map
        auto splitLayerInfo = split_connection.find(splitName);

        if (splitLayerInfo != split_connection.end()) {
            auto &splitLayerInfoItem = splitLayerInfo->second;
            // find this layer's entry among the split's output layers
            auto it = std::find_if(splitLayerInfoItem.splitOutputLayers.begin(),
                                   splitLayerInfoItem.splitOutputLayers.end(),
                                   [&name](GNAPlugin::GNASplitLayer::SplitConnectedLayerInfo &item) {
                                       return item.name == name;
                                   });

            if (it != splitLayerInfoItem.splitOutputLayers.end()) {
                gnalog() << "Connecting split/slice input \n";
                auto res = connectInput(splittingLayer, ptr,
                                        splitLayerInfoItem.reserved_size, it->offset, 0);
                gnalog() << "Connected \n";
                return res;
            }
        }
        THROW_GNA_EXCEPTION << "Split/Slice layer: " << splitName
                            << " is not included in the extra map; something went wrong";
    } else if (layerInfoObj.isConcat()) {
        auto concatLayerInfo = concat_connection.find(prevLayer->name);
        if (concatLayerInfo != concat_connection.end()) {
            auto & concatLayerInfoItem = concatLayerInfo->second;
            // this layer consumes a concat: bind into the concat's shared buffer
            gnamem->bind_ptr(ptr, &concatLayerInfoItem.gna_ptr, offset);
            // return the layer above the concat
            return CNNNetPrevLayer(prevLayer);
        }
    } else if (layerInfoObj.isCrop()) {
        auto cropLayerInfo = crop_connection.find(prevLayer->name);
        if (cropLayerInfo != crop_connection.end()) {
            auto & cropLayerInfoItem = cropLayerInfo->second;
            gnamem->bind_ptr(ptr, &cropLayerInfoItem.gna_ptr, offset);
            return CNNNetPrevLayer(prevLayer);
        }
    }
    auto prevDnnLayer = findDnnLayer(prevLayer);

    // check for a generic previous layer
    if (prevDnnLayer != nullptr) {
        gnamem->bind_ptr(ptr, &prevDnnLayer->ptr_outputs, offset);
        return prevLayer;
    }

    auto prevMemoryLayer =
        std::find_if(begin(memory_connection), end(memory_connection), [&](MemoryConnection::value_type &comp) {
            return comp.second.getInput()->name == prevLayer->name;
        });
    if (prevMemoryLayer != memory_connection.end()) {
        // this layer is the input of a memory output layer
        auto& memoryLayer = prevMemoryLayer->second;
        if (memoryLayer.reserved_size == 0) {
            gnamem->reserve_ptr(&memoryLayer.gna_ptr, ALIGN64(num_data_bytes_in));
            gnamem->bind_ptr(ptr, &memoryLayer.gna_ptr, offset);

            memoryLayer.reserved_offset = offset;
            memoryLayer.reserved_size = ALIGN64(num_data_bytes_in);
        } else {
            IE_ASSERT(memoryLayer.reserved_size == ALIGN64(num_data_bytes_in));
            // reuse the offset recorded when the region was reserved
            gnamem->bind_ptr(ptr, &memoryLayer.gna_ptr, memoryLayer.reserved_offset);
        }

        return prevLayer;
    }

    // several layer types are simply skipped here
    if (LayerInfo(prevLayer).isReshape()) {
        gnalog() << "Skipping reshape layer: " << prevLayer->name << "\n";
        return connectInput(prevLayer, ptr, num_data_bytes_in, offset, 0);
    }

    if (LayerInfo(prevLayer).isPermute()) {
        gnalog() << "Skipping permute layer: " << prevLayer->name << "\n";
        return {connectInput(prevLayer, ptr, num_data_bytes_in, offset, 0).input, true, prevLayer};
    }

    THROW_GNA_EXCEPTION << "Cannot connect input for: " << layer->name;
}
