Publishing 2019 R1 content
[platform/upstream/dldt.git] / inference-engine / src / gna_plugin / gna_plugin.cpp
index 620aa48..fc57d52 100644
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -131,7 +131,7 @@ void GNAPlugin::copyInputData(T *dst,
         for (uint32_t i = 0; i < num_frames; i++) {
             for (uint32_t j = 0; j < num_vector_elements; j++) {
                 if (!std::is_same<T, U>::value) {
-                    dst[j * num_group + i] = GNAPluginNS::ConvertFloatToInt16(src[i * num_vector_elements + j] * input_scale_factor);
+                    dst[j * num_group + i] = GNAPluginNS::ConvertFloatToInt16(src[i * num_vector_elements + j] * get_input_scale_factor());
                 } else {
                     dst[j * num_group + i] = src[i * num_vector_elements + j];
                 }
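Note the destination index `dst[j * num_group + i]`: the copy transposes frame-major input into GNA's interleaved (element-major) layout while applying the per-input scale factor. `GNAPluginNS::ConvertFloatToInt16` is defined elsewhere in the plugin; a saturating round-and-cast along these lines is assumed here (a sketch, not the plugin's actual definition):

    #include <cstdint>
    #include <cmath>

    // Assumed behaviour: round to nearest, then saturate to the int16_t range.
    static int16_t ConvertFloatToInt16(float value) {
        float rounded = std::roundf(value);
        if (rounded > 32767.0f)  return INT16_MAX;   // saturate high
        if (rounded < -32768.0f) return INT16_MIN;   // saturate low
        return static_cast<int16_t>(rounded);
    }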
@@ -154,7 +154,7 @@ void GNAPlugin::copyInputData(T *dst,
                 U *ptr_src_vec = const_cast<U *>(reinterpret_cast<const U *>(src) + i * num_vector_elements);
                 std::memset(ptr_dst_vec, 0, num_vector_stride * sizeof(T));
                 for (int j=0; j < num_vector_elements; j++) {
-                    ptr_dst_vec[j] = GNAPluginNS::ConvertFloatToInt16(ptr_src_vec[j] * input_scale_factor);
+                    ptr_dst_vec[j] = GNAPluginNS::ConvertFloatToInt16(ptr_src_vec[j] * get_input_scale_factor());
                 }
             }
 
@@ -189,9 +189,13 @@ void GNAPlugin::copyInputDataWithSplit(T *const dst,
     for (auto&& outputLayer : splitInfo.splitOutputLayers) {
         uint32_t begin = outputLayer.offset/precision_size;
         uint32_t end = (outputLayer.offset + outputLayer.pure_size)/precision_size;
+        if (dst_ptr - dst >= end) {
+            // this output layer's pointer is bound to the previous one's; skip it
+            continue;
+        }
         for (uint32_t i = begin; i < end; ++i) {
             if (!std::is_same<T, U>::value) {
-                *(dst_ptr++) = GNAPluginNS::ConvertFloatToInt16(*(src_ptr++) * input_scale_factor);
+                *(dst_ptr++) = GNAPluginNS::ConvertFloatToInt16(*(src_ptr++) * get_input_scale_factor());
             } else {
                 *(dst_ptr++) = *(src_ptr++);
             }
@@ -285,46 +289,39 @@ void GNAPlugin::ImportFrames(
                   uint32_t num_group,
                   uint32_t num_vector_elements,
                   uint32_t num_vector_stride) {
-    // special case if split/slice layers connected
-    // with Input detected
-    auto it = split_connection.end();
-    if (split_connection.size() != 0) {
-        it = std::find_if(split_connection.begin(), split_connection.end(), []
-                    (const std::pair<std::string, GNASplitLayer> &item) -> bool {
-                        return CaselessEq<std::string>()(item.second.splitInputLayer.name, "Input");
-                    });
-    }
     if (orientation == kDnnInterleavedOrientation) {
         // TODO: fix that as well
-        if (input_precision.size() == 2) {
+        if (input_precision == Precision::U8) {
+            int16_t *dst = const_cast<int16_t *>(reinterpret_cast<const int16_t *>(ptr_dst));
+            uint8_t *src = const_cast<uint8_t *>(reinterpret_cast<const uint8_t *>(ptr_src));
+            copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
+        } else if (input_precision.size() == 2) {
             int16_t *dst = const_cast<int16_t *>(reinterpret_cast<const int16_t *>(ptr_dst));
             int16_t *src = const_cast<int16_t *>(reinterpret_cast<const int16_t *>(ptr_src));
-            if (it != split_connection.end()) {
-                copyInputDataWithSplit(dst, src, it->second, input_precision.size());
-            } else {
-                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
-            }
+            copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
         } else if (input_precision.size() == 4) {
             if (!gnadevice) {
                 float *dst = const_cast<float *>(reinterpret_cast<const float *>(ptr_dst));
                 float *src = const_cast<float *>(reinterpret_cast<const float *>(ptr_src));
-                if (it != split_connection.end()) {
-                    copyInputDataWithSplit(dst, src, it->second, input_precision.size());
-                } else {
-                    copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
-                }
+                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
             } else {
                 int16_t *dst = reinterpret_cast<int16_t *>(ptr_dst);
                 const float *src = reinterpret_cast<const float *>(ptr_src);
-                if (it != split_connection.end()) {
-                    copyInputDataWithSplit(dst, src, it->second, input_precision.size());
-                } else {
-                    copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
-                }
+                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
             }
         }
     } else {
-        if (input_precision.size()== 2) {
+        if (input_precision == Precision::U8) {
+            uint8_t *src = const_cast<uint8_t *>(reinterpret_cast<const uint8_t *>(ptr_src));
+            if (!gnadevice) {
+                float *dst = const_cast<float *>(reinterpret_cast<const float *>(ptr_dst));
+                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
+            } else {
+                int16_t *dst = const_cast<int16_t *>(reinterpret_cast<const int16_t *>(ptr_dst));
+                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
+            }
+
+        } else if (input_precision.size() == 2) {
             int16_t *dst = const_cast<int16_t *>(reinterpret_cast<const int16_t *>(ptr_dst));
             int16_t *src = const_cast<int16_t *>(reinterpret_cast<const int16_t *>(ptr_src));
             copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
@@ -342,9 +339,8 @@ void GNAPlugin::ImportFrames(
     }
 }
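With the split/slice special case moved out (it now goes through the new AffineFilter primitive), ImportFrames dispatches only on input precision and on whether a GNA device is present. The behaviour of the newly added U8 branches, read off the hunk above:

    orientation      src      dst
    interleaved      uint8_t  int16_t (converted and scaled inside copyInputData)
    non-interleaved  uint8_t  float when !gnadevice, int16_t when a device is set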
 
-void GNAPlugin::fillMemoryConnections(std::map<std::string,
-                                            std::vector<InferenceEngine::CNNLayerPtr>>&
-                                                                            memoryPairs) {
+void GNAPlugin::fillMemoryConnections(std::unordered_map<std::string,
+                                            std::vector<InferenceEngine::CNNLayerPtr>>& memoryPairs) {
     for (auto &memory : memoryPairs) {
         auto inputLayer = memory.second[1];
         auto outputLayer = memory.second[0];
@@ -401,7 +397,7 @@ void GNAPlugin::fillSplitConnections(InferenceEngine::CNNLayerPtr layer) {
     LayerInfo ptrSplitLayerInputLayerInfo(ptrSplitLayerInput);
     for (size_t i = 0; i < layer->outData.size(); ++i) {
         size_t padding = 0;
-        size_t layer_size = 0;
+        size_t output_layer_size = 0;
         auto& dataOutput = layer->outData[i];
 
         if (!dataOutput || !dataInput) {
@@ -416,16 +412,19 @@ void GNAPlugin::fillSplitConnections(InferenceEngine::CNNLayerPtr layer) {
 
             padding = std::max(padding, LayerInfo(ptrSplitLayerOutput).paddingSize())
                                                         * dataOutput->precision.size();
-            layer_size =
+            output_layer_size =
                     InferenceEngine::details::product(begin(dataOutput->dims),
                                                      end(dataOutput->dims)) * dataOutput->precision.size();
 
-            layerInfoItem.splitOutputLayers.emplace_back(ptrSplitLayerOutput->name, split_size, layer_size);
+            if (ptrSplitLayerOutput->type == "AffineFilter") {
+                size_t aligned64_offset = ptrSplitLayerOutput->GetParamAsInt("offset");
+                layerInfoItem.splitOutputLayers.emplace_back(ptrSplitLayerOutput->name, aligned64_offset, output_layer_size);
+            } else {
+                layerInfoItem.splitOutputLayers.emplace_back(ptrSplitLayerOutput->name, split_size, output_layer_size);
+            }
         }
 
-        split_size += ptrSplitLayerInputLayerInfo.isInput() ?
-                                ALIGN64(padding + layer_size):
-                                        padding + layer_size;
+        split_size += padding + output_layer_size;
     }
     layerInfoItem.reserved_size = split_size;
     layerInfoItem.splitInputLayer =
@@ -717,9 +716,9 @@ void GNAPlugin::CopyPrimitive(InferenceEngine::CNNLayerPtr layer) {
     auto &currentComponent = dnnComponentsForLayer.back().second;
     dnn.InitCopyComponent(currentComponent,
                           orientation,
-                          num_rows_in + num_padding_in,
+                          ALIGN(num_rows_in, 8),
                           num_columns_in,
-                          num_rows_out + num_padding_out,
+                          ALIGN(num_rows_out, 8),
                           num_columns_out,
                           inputs->precision.size(),
                           outputs->precision.size(),
@@ -732,7 +731,7 @@ void GNAPlugin::CopyPrimitive(InferenceEngine::CNNLayerPtr layer) {
     size_t num_data_bytes_out = ALIGN(InferenceEngine::details::product(
                                                             begin(outputs->dims), end(outputs->dims)), 8)
                                                                                 * outputs->precision.size();
-    size_t num_data_bytes_in = num_columns_in * (num_rows_in + num_padding_in) * inputs->precision.size();
+    size_t num_data_bytes_in = num_columns_in * ALIGN(num_rows_in, 8) * inputs->precision.size();
 
     connectInput(layer, ptr_inputs, num_data_bytes_in);
     connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out);
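This hunk replaces explicit `num_padding` arithmetic with `ALIGN(num_rows_in, 8)`: GNA requires row counts padded to a multiple of 8. The macro lives in a shared plugin header; a typical round-up-to-multiple definition, assumed here for readability, is:

    // Assumed definitions; the real macros live in a shared plugin header.
    #define ALIGN(number, significance) \
        ((((number) + (significance) - 1) / (significance)) * (significance))
    #define ALIGN64(number) ALIGN(number, 64)

    // Worked examples:
    //   ALIGN(10, 8)  == 16  -> a 10-row copy gets 6 padding rows
    //   ALIGN64(128) == 128  -> a 128-byte crop offset keeps the plain-crop path below
    //   ALIGN64(100) == 128  -> a 100-byte crop offset forces the affine-filter path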
@@ -757,13 +756,23 @@ void GNAPlugin::ConcatPrimitive(InferenceEngine::CNNLayerPtr layer) {
         THROW_GNA_EXCEPTION << "Different precision for Concat input layers are not supported";
     }
 
+    auto& concatLayerInfo = concat_connection.find(concatLayer->name)->second;
     for (auto &&outLayer : concatLayer->outData.front()->getInputTo()) {
         if ( LayerInfo(outLayer.second).isConcat() ) {
-            auto& concatLayerInfo = concat_connection.find(concatLayer->name)->second;
             connectOutput(layer, &concatLayerInfo.gna_ptr,
                           &concatLayerInfo.gna_ptr, concatLayerInfo.reserved_size);
         }
     }
+
+    size_t idx = 0;
+    for (auto && inputLayer : concatLayerInfo.concatInputLayers) {
+        if ( InferenceEngine::details::CaselessEq<std::string>()
+                                            (inputLayer.name, "input") ) {
+            connectInput(layer, &concatLayerInfo.gna_ptr,
+                                concatLayerInfo.reserved_size - inputLayer.offset, static_cast<int32_t>(-inputLayer.offset), idx);
+        }
+        ++idx;
+    }
 }
 
 void GNAPlugin::CropPrimitive(InferenceEngine::CNNLayerPtr layer) {
@@ -780,9 +789,9 @@ void GNAPlugin::CropPrimitive(InferenceEngine::CNNLayerPtr layer) {
 
     auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
     size_t cropOffset = cropLayer->offset.back() * cropLayer->precision.size();
-    size_t cropSize = cropLayer->dim.back() * cropLayer->precision.size();
+    size_t cropOutputSize = cropLayer->dim.back() * cropLayer->precision.size();
 
-    if (ALIGN(cropOffset, 8) == cropOffset) {
+    if (ALIGN64(cropOffset) == cropOffset) {
         // leave crop as it is
         GNAPlugin::GNACropLayer cropLayerInfoItem(layer);
         std::string& id = layer->name;
@@ -795,13 +804,13 @@ void GNAPlugin::CropPrimitive(InferenceEngine::CNNLayerPtr layer) {
         }
 
         // calculate index idx for connectInput last parameter
-        connectInput(layer, &cropLayerInfo->second.gna_ptr, cropSize + cropOffset, cropOffset, 0);
+        connectInput(layer, &cropLayerInfo->second.gna_ptr, cropOutputSize + cropOffset, cropOffset, 0);
 
         // cases for certain output layers
         for (auto &&outLayer : layer->outData.front()->getInputTo()) {
             auto& nextLayer = outLayer.second;
             if ( LayerInfo(nextLayer).isConcat() ) {
-                connectOutput(layer, &cropLayerInfo->second.gna_ptr, &cropLayerInfo->second.gna_ptr, cropSize);
+                connectOutput(layer, &cropLayerInfo->second.gna_ptr, &cropLayerInfo->second.gna_ptr, cropOutputSize);
             }
         }
     } else {
@@ -842,30 +851,16 @@ void GNAPlugin::CropPrimitive(InferenceEngine::CNNLayerPtr layer) {
                                           begin(outputs->dims), end(outputs->dims)) * 4;
 
         size_t num_data_bytes_in = num_columns_in *
-        (num_rows_in + num_padding) * inputs->precision.size();
+                ALIGN(num_rows_in, 8) * inputs->precision.size();
 
         connectInput(layer, ptr_inputs, num_data_bytes_in, 0, 0);
         connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out);
 
-        gnamem->readonly().push_initializer(ptr_weights, num_rows_out * (num_rows_in + num_padding)*layer->precision.size(), [=](void * data, size_t size) {
-            int out = 0;
-            for (int input = cropLayer->offset.back(); input < num_rows_out + cropLayer->offset.back(); ++input) {
-                auto mem_ptr = reinterpret_cast<uint8_t *>(data) + input * layer->precision.size() + out * (num_rows_in+num_padding) * layer->precision.size();
-                if (quantized == nullptr) {
-                    auto float_ptr = reinterpret_cast<float *>(mem_ptr);
-                    *float_ptr = 1.0f;
-                } else {
-                    auto int_ptr = reinterpret_cast<uint16_t *>(mem_ptr);
-                    *int_ptr = 1;
-                }
-                ++out;
-            }
-        }, 64);
-        if (quantized == nullptr) {
-            gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64);
-        } else {
+        FillWeightOfAligningFilter(layer, ptr_weights, cropLayer->offset.back(), quantized != nullptr);
+
+        if (quantized == nullptr) {
+            gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64);
+        } else {
+            gnamem->readonly().push_value<int32_t>(ptr_biases, 0, num_rows_out, 64);
+        }
-        }
     }
 }
 
@@ -907,6 +902,7 @@ void GNAPlugin::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) {
     uint32_t num_rows_in = FROM_IR_DIM(inputs4Bytes, 1);
     uint32_t num_columns_in = FROM_IR_DIM(inputs4Bytes, 2);
     uint32_t num_rows_out = num_rows_in;
+    uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in;
 
     void *ptr_inputs;
     void *ptr_outputs;
@@ -916,9 +912,9 @@ void GNAPlugin::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) {
     dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t());
     auto &currentComponent = dnnComponentsForLayer.back().second;
     dnn.InitAffineComponent(currentComponent,
-                            num_rows_in,
+                            num_rows_in + num_padding,
                             num_columns_in,
-                            num_rows_out,
+                            num_rows_out + num_padding,
                             inputs2Bytes->precision.size(),
                             outputs->precision.size(),
                             // TODO: only fp32 and Int16 tested
@@ -936,11 +932,11 @@ void GNAPlugin::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) {
     cout << "IR layer : " << std::left << std::setw(20) << layer->name << "diagonal_"<< dnnComponentsForLayer.size() - 1 << "\n";
 #endif
 
-    size_t num_data_bytes_out = InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims))
-        * outputs->precision.size();
+    size_t num_data_bytes_out =
+        InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims)) * outputs->precision.size();
 
-    size_t num_data_bytes_in = InferenceEngine::details::product(begin(inputs2Bytes->dims), end(inputs2Bytes->dims))
-        * inputs2Bytes->precision.size();
+    size_t num_data_bytes_in =
+        num_columns_in * (num_rows_in + num_padding) * inputs2Bytes->precision.size();
 
     connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out);
     connectInput(layer, ptr_inputs, num_data_bytes_in, 0, 1 - biasesLayerIdx);
@@ -955,6 +951,7 @@ void GNAPlugin::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) {
                 #define FLOAT_TO_INT16(a) static_cast<int16_t>(((a) < 0)?((a) - 0.5):((a) + 0.5))
 
                 auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast<float>(INT16_MAX)));
+
                 gnamem->readonly().push_value<int16_t>(ptr_weights, quantizedIdentity, num_rows_out, 64);
             }
             connectInput(layer, ptr_biases, num_data_bytes_in, 0, biasesLayerIdx);
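The FLOAT_TO_INT16 macro defined above rounds half away from zero before the cast; a few worked values:

    // FLOAT_TO_INT16( 2.4f) -> static_cast<int16_t>( 2.9f) ==  2
    // FLOAT_TO_INT16( 2.5f) -> static_cast<int16_t>( 3.0f) ==  3
    // FLOAT_TO_INT16(-2.5f) -> static_cast<int16_t>(-3.0f) == -3   (half away from zero)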
@@ -1028,19 +1025,25 @@ void GNAPlugin::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool isDiag)
     auto transpose = false;
     auto transposedRows = 0;
     auto transposedCols = 0;
-    /**
-     * TODO: enable transpose correction between Conv/affine layers implement dedicated pass
-     * TF topologies have inplace permutes so we dont care
-     * kaldi topologies did this internally
-     */
+
     if (0 && connectionInfo.needTransposeWeights) {
-        gnalog() << "Transposing weights for layer: " << layer->name << "\n";
         // direct order is 0, 1, 2, 3, supported order is only 0,3,2,1 where dim 2 is usually equals to 1
         auto permuteOrder = connectionInfo.permute->GetParamAsInts("order");
         if (permuteOrder != vector<int>({0, 3, 2, 1})) {
             THROW_IE_EXCEPTION << "[GNA plugin] Unsupported permute order: was " << layer->GetParamAsString("order") <<
                                ", but only support 0, 3, 2, 1";
         }
+
+        /**
+         * TODO: weights transpose happened after quantisation might result in poor quality for in 8 - move this to passes
+         */
+        if (weightable._weights->precision() == Precision::I8) {
+            THROW_IE_EXCEPTION << "[GNA plugin] Unsupported permute operation for 8 bit weights for layer: " << layer->name;
+        }
+
+        // this affine connected to convolution via pool or activation
+        gnalog() << "Transposing weights for layer: " << layer->name << "\n";
+
         transpose = !isDiag;
         transposedRows = connectionInfo.permute->input()->getDims()[3];
         transposedCols = connectionInfo.permute->input()->getDims()[1];
@@ -1053,7 +1056,6 @@ void GNAPlugin::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool isDiag)
                                         weightable._weights->byteSize(),
                                         64);
         } else {
-            // ToDO: write unit tests for transpose
             gnamem->readonly().push_initializer(ptr_weights, weightable._weights->byteSize(), [=](void * data, size_t size) {
                 for (int k = 0; k < (isDiag ? 1 : num_rows_out); k++) {
                     auto rowOffset = k * transposedRows * transposedCols * weightable.precision.size();
@@ -1063,13 +1065,16 @@ void GNAPlugin::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool isDiag)
                         for (int i = 0; i < transposedRows; i++) {
                             auto offsetWrite = (transposedRows * j + i) * weightable.precision.size();
                             auto offsetRead = (i * transposedCols + j) * weightable.precision.size();
-                            memcpy(u8Data + offsetWrite, cbuffer + offsetRead, weightable.precision.size());
+                            std::memcpy(u8Data + offsetWrite, cbuffer + offsetRead, weightable.precision.size());
                         }
                     }
                 }
             }, 64);
         }
     } else {
+        if (transpose) {
+            THROW_GNA_EXCEPTION << "transpozed weights with non zero padding not yet supported";
+        }
         auto elementsIn = (num_rows_in + num_padding) * num_columns_in;
         auto paddedWeights = isDiag ? elementsIn : elementsIn * num_rows_out;
         auto paddedWeightsSize = paddedWeights * weightable.precision.size();
@@ -1094,6 +1099,123 @@ void GNAPlugin::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool isDiag)
     }
 }
 
+void GNAPlugin::FillWeightOfAligningFilter(InferenceEngine::CNNLayerPtr layer, void* ptrWeights, size_t offset, bool isQuantized) {
+    auto outputs = *layer->outData.begin();
+    auto inputs = layer->insData.begin()->lock();
+
+    uint32_t num_rows_in = FROM_IR_DIM(inputs, 1);
+    uint32_t num_rows_out = FROM_IR_DIM(outputs, 1);
+
+    if (!ptrWeights) {
+        THROW_GNA_EXCEPTION << "Weights memory is not allocated!!!";
+    }
+
+    gnamem->readonly().push_initializer(ptrWeights, num_rows_out * ALIGN(num_rows_in, 8) * layer->precision.size(), [=](void * data, size_t size) {
+        int out = 0;
+        for (int input = offset; input < num_rows_out + offset; ++input) {
+            auto mem_ptr = reinterpret_cast<uint8_t *>(data) + input * layer->precision.size() + out * ALIGN(num_rows_in, 8) * layer->precision.size();
+            if (!isQuantized) {
+                auto float_ptr = reinterpret_cast<float *>(mem_ptr);
+                *float_ptr = 1.0f;
+            } else {
+                auto int_ptr = reinterpret_cast<uint16_t *>(mem_ptr);
+                *int_ptr = 1;
+            }
+            ++out;
+        }
+    }, 64);
+}
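The initializer above writes a shifted identity matrix whose row stride is the 8-aligned input width (the region is assumed zero-initialised by the allocator). With hypothetical sizes num_rows_in = 6 (padded to 8), offset = 2, num_rows_out = 3 it produces:

    // row 0:  0 0 1 0 0 0 0 0
    // row 1:  0 0 0 1 0 0 0 0
    // row 2:  0 0 0 0 1 0 0 0
    //
    // W[out][in] = (in == offset + out) ? 1 : 0, so y = W * x copies
    // x[offset .. offset + num_rows_out) into y: a crop expressed as an affine layer.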
+
+void GNAPlugin::AffineFilterPrimitive(InferenceEngine::CNNLayerPtr layer) {
+    auto filterLayer = dynamic_cast<InferenceEngine::WeightableLayer *> (layer.get());
+
+    if (filterLayer == nullptr) {
+        return;
+    }
+
+    std::string& name = filterLayer->name;
+    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
+
+    // we look for this concat layer pointer in extra concat map
+    auto prevLayer = CNNNetPrevLayer(layer.get(), 0);
+    if (!LayerInfo(prevLayer).isSplit() && !LayerInfo(prevLayer).isSlice()) {
+        THROW_GNA_EXCEPTION << "Case  with Affine Aligning Filter for not Split/Slice layers is not implemented yet!";
+    }
+
+    void *ptr_inputs;
+    void *ptr_outputs;
+    void *ptr_weights;
+    void *ptr_biases;
+
+    auto outputs = *layer->outData.begin();
+    auto inputs = layer->insData.begin()->lock();
+
+    uint32_t num_columns_in = FROM_IR_DIM(inputs, 2);
+    uint32_t num_rows_out = FROM_IR_DIM(outputs, 1);
+    uint32_t num_rows_in = filterLayer->_weights->size() / num_rows_out;
+
+    uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in;
+
+    gnalog() << "Filter " << layer->name << " is being inserted...\n";
+    auto biasPrecision = filterLayer->_biases ? filterLayer->_biases->precision() : outputs->precision;
+    dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t());
+    auto &currentComponent = dnnComponentsForLayer.back().second;
+    dnn.InitAffineComponent(currentComponent,
+                            num_rows_in + num_padding,
+                            num_columns_in,
+                            num_rows_out,
+                            inputs->precision.size(),
+                            outputs->precision.size(),
+                            filterLayer->_weights->precision().size(),
+                            biasPrecision.size(),
+                            quantized == nullptr ? 1 : quantized->_weights_quant.scale,
+                            quantized == nullptr ? 1 : quantized->_dst_quant.scale,
+                            ptr_inputs,
+                            ptr_outputs,
+                            ptr_weights,
+                            ptr_biases,
+                            false);
+
+    size_t num_data_bytes_out =
+                InferenceEngine::details::product(
+                                        begin(outputs->dims), end(outputs->dims)) * 4;
+
+    size_t num_data_bytes_in = num_columns_in *
+                            ALIGN(num_rows_in, 8) * inputs->precision.size();
+
+    connectInput(layer, ptr_inputs, num_data_bytes_in, 0, 0);
+    connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out);
+
+    if (num_padding == 0) {
+        gnamem->readonly().push_ptr(ptr_weights,
+                                filterLayer->_weights->cbuffer().as<const void *>(),
+                                filterLayer->_weights->byteSize(),
+                                                            64);
+    } else {
+        auto elementsIn = (num_rows_in + num_padding) * num_columns_in;
+        auto paddedWeights = elementsIn * num_rows_out;
+        auto paddedWeightsSize = paddedWeights * filterLayer->precision.size();
+
+        gnamem->readonly().push_initializer(ptr_weights, paddedWeightsSize, [=](void * data, size_t size) {
+            for (int i = 0; i < num_rows_out; i++) {
+                std::memcpy(data,
+                       filterLayer->_weights->cbuffer().as<const uint8_t *>() + num_rows_in * i * filterLayer->precision.size(),
+                       num_rows_in * filterLayer->precision.size());
+                data = reinterpret_cast<uint8_t *>(data) + (num_rows_in + num_padding) * filterLayer->precision.size();
+            }
+        }, 64);
+    }
+
+    if (filterLayer->_biases) {
+        gnamem->readonly().push_ptr(ptr_biases,
+                         filterLayer->_biases->cbuffer().as<const void *>(),
+                         filterLayer->_biases->byteSize(),
+                         64);
+    } else {
+        gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64);
+    }
+}
+
 void GNAPlugin::PWLPrimitive(InferenceEngine::CNNLayerPtr layer) {
     auto *generic = dynamic_cast<GenericLayer *>(layer.get());
     std::string type;
@@ -1269,6 +1391,7 @@ void GNAPlugin::CreateLayerPrimitive(CNNLayerPtr layer) {
         {{"Input"}, [](GNAPlugin*, CNNLayerPtr l) {}},  // skip input layers they are not used in GNA lib, only as a memory blobs
         {{"FullyConnected", "InnerProduct"}, CREATE(AffinePrimitive)},
         {{"ScaleShift"}, CREATE(DiagonalPrimitive)},
+        {{"AffineFilter"}, CREATE(AffineFilterPrimitive)},
         {{"Eltwise"},
          CREATE(EltwisePrimitive)},  // same as diagonal while weights are not taken from network, rather than from another output
         {{"Split"}, SKIP},  // skip information about which part of prev layer need to consume handle during layer creation
@@ -1293,109 +1416,10 @@ void GNAPlugin::CreateLayerPrimitive(CNNLayerPtr layer) {
 
 
 GNAPlugin::GNAPlugin(const std::map<std::string, std::string>& configMap) {
-    // holds actual value of a found key
-    std::string value;
-    auto if_set = [&](std::string key, const std::function<void()> & handler) {
-        auto keyInMap = configMap.find(key);
-        if (keyInMap != configMap.end()) {
-            value = keyInMap->second;
-            handler();
-        }
-    };
-
-    if_set(GNA_CONFIG_KEY(SCALE_FACTOR), [&] {
-        input_scale_factor = std::stod(value);
-    });
-
-    if_set(GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE), [&] {
-        dumpXNNPath = value;
-    });
-
-    if_set(GNA_CONFIG_KEY(DEVICE_MODE), [&] {
-        static caseless_unordered_map <std::string, uint32_t> supported_values = {
-            {GNAConfigParams::GNA_AUTO, GNA_AUTO},
-            {GNAConfigParams::GNA_HW, GNA_HARDWARE},
-            {GNAConfigParams::GNA_SW, GNA_SOFTWARE},
-            {GNAConfigParams::GNA_SW_EXACT, GNA_SOFTWARE & GNA_HARDWARE}
-        };
-        auto procType = supported_values.find(value);
-        if (procType == supported_values.end()) {
-            THROW_GNA_EXCEPTION << "GNA device mode unsupported: " << value;
-        }
-        gna_proc_type = static_cast<intel_gna_proc_t>(procType->second);
-    });
-
-    if_set(GNA_CONFIG_KEY(COMPACT_MODE), [&] {
-        if (value == PluginConfigParams::YES) {
-            compact_mode = true;
-        } else if (value == PluginConfigParams::NO) {
-            compact_mode = false;
-        } else {
-            THROW_GNA_EXCEPTION << "GNA compact mode should be YES/NO, but not" << value;
-        }
-    });
-
-    if_set(CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS), [&] {
-        if (value == PluginConfigParams::YES) {
-            exclusive_async_requests  = true;
-        } else if (value == PluginConfigParams::NO) {
-            exclusive_async_requests  = false;
-        } else {
-            THROW_GNA_EXCEPTION << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not" << value;
-        }
-    });
-
-    if_set(GNA_CONFIG_KEY(PRECISION), [&] {
-        auto precision = Precision::FromStr(value);
-        if (precision != Precision::I8 && precision != Precision::I16) {
-            THROW_GNA_EXCEPTION << "Unsupported precision of GNA hardware, should be Int16 or Int8, but was: " << value;
-        }
-        gnaPrecision = precision;
-    });
-
-    if_set(GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN), [&] {
-        if (value == PluginConfigParams::YES) {
-            uniformPwlDesign = true;
-        } else if (value == PluginConfigParams::NO) {
-            uniformPwlDesign = false;
-        } else {
-            THROW_GNA_EXCEPTION << "GNA pwl uniform algorithm parameter "
-                                                            << "should be equal to YES/NO, but not" << value;
-        }
-    });
-
-    if_set(CONFIG_KEY(PERF_COUNT), [&] {
-        if (value == PluginConfigParams::YES) {
-            performance_counting = true;
-        } else if (value == PluginConfigParams::NO) {
-            performance_counting = false;
-        } else {
-            THROW_GNA_EXCEPTION << "GNA performance counter enabling parameter "
-                                                            << "should be equal to YES/NO, but not" << value;
-        }
-    });
-
-    if_set(GNA_CONFIG_KEY(LIB_N_THREADS), [&] {
-        uint64_t lib_threads = std::stoul(value, NULL, 10);
-        if (lib_threads == 0 || lib_threads > std::numeric_limits<uint8_t>::max()/2-1) {
-            THROW_GNA_EXCEPTION << "Unsupported accelerator lib number of threads: " << value
-                                                            << ", should be greateer than 0 and less than 127";
-        }
-        gna_lib_async_threads_num = lib_threads;
-    });
-
-    if_set(CONFIG_KEY(SINGLE_THREAD), [&] {
-        if (value == PluginConfigParams::YES) {
-            gna_openmp_multithreading  = false;
-        } else if (value == PluginConfigParams::NO) {
-            gna_openmp_multithreading  = true;
-        } else {
-            THROW_GNA_EXCEPTION << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not" << value;
-        }
-    });
+    SetConfig(configMap);
 }
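The constructor body moved into SetConfig(), so a single key/value map now drives both construction-time and later configuration. A minimal usage sketch, assuming the literal strings the GNA_CONFIG_KEY macros expand to in this release:

    // Sketch; the key literals are assumptions based on the GNA_CONFIG_KEY(...) names above.
    std::map<std::string, std::string> config = {
        {"GNA_SCALE_FACTOR",  "2048"},      // GNA_CONFIG_KEY(SCALE_FACTOR)
        {"GNA_DEVICE_MODE",   "GNA_AUTO"},  // GNA_CONFIG_KEY(DEVICE_MODE)
        {"GNA_COMPACT_MODE",  "NO"},        // GNA_CONFIG_KEY(COMPACT_MODE)
        {"GNA_LIB_N_THREADS", "1"},         // GNA_CONFIG_KEY(LIB_N_THREADS)
    };
    GNAPlugin plugin(config);  // equivalent to default-constructing and calling SetConfig(config)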
 
-GNAPluginNS::GNAPlugin::LayerType GNAPlugin::LayerTypeFromStr(const std::string &str) {
+GNAPluginNS::GNAPlugin::LayerType GNAPlugin::LayerTypeFromStr(const std::string &str) const {
     static const caseless_map<std::string, GNAPlugin::LayerType> LayerNameToType = {
         { "Input" , Input },
         { "Convolution" , Convolution },
@@ -1433,13 +1457,14 @@ bool GNAPlugin::AreLayersSupported(ICNNNetwork& network, std::string& errMessage
     auto network_precision = network.getPrecision();
     network.getInputsInfo(inputs);
     auto network_input_precision = inputs.begin()->second->getInputPrecision();
-    auto batch_sise = network.getBatchSize();
+    auto batch_size = network.getBatchSize();
     if (network_precision != Precision::FP32) {
         errMessage = "The plugin does not support networks with " + std::string(network_precision.name()) + " format.\n";
         return false;
     }
     if (network_input_precision != Precision::FP32 &&
-        network_input_precision != Precision::I16) {
+        network_input_precision != Precision::I16 &&
+        network_input_precision != Precision::U8) {
         errMessage = "The plugin does not support input precision with " + std::string(network_input_precision.name()) + " format.\n";
         return false;
     }
@@ -1469,7 +1494,9 @@ bool GNAPlugin::AreLayersSupported(ICNNNetwork& network, std::string& errMessage
                                                     errMessage = "Layer is unsupported by GNA: " + layer->name + ":" + layer->type + "\n";
                                                     check_result =  false;
                                                 }
-                                                if (batch_sise != 1 && LayerInfo::isBatchSizeConstrained(layer->type)) {
+                                                if (batch_size != 1 && LayerInfo::isBatchSizeConstrained(layer->type)) {
+                                                    errMessage = "topology with layer: " + layer->name + ", type: " + layer->type +
+                                                                 ", and batch size (" + to_string(batch_size) + ") != 1 is not supported";
                                                     check_result =  false;
                                                 }
                                             }, false);
@@ -1477,6 +1504,10 @@ bool GNAPlugin::AreLayersSupported(ICNNNetwork& network, std::string& errMessage
     return check_result;
 }
 
+float GNAPlugin::get_input_scale_factor() const {
+    return input_scale_factor.empty() ? 1.0f : input_scale_factor.begin()->second;
+}
+
 void GNAPlugin::LoadNetwork(ICNNNetwork &network) {
     //  Check the input network
     std::string error;
@@ -1490,21 +1521,34 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) {
         substitutePRelu(layers);
         layers = CNNNetSortTopologically(*network.get());
         reorderMaxPool(layers);
-        applyOrientations(layers);
+        //  TODO: re-sort only if the insertion function
+        //  reports changes via a bool "changed" return flag
+        insertAligningFilterLayer(layers);
+
+#if ENABLE_AUTO_PERMUTE
+        layers = CNNNetSortTopologically(*network.get());
+        reversePermutations(layers);
+#endif
+        layers = CNNNetSortTopologically(*network.get());
         insertIdentityLayer(layers);
+        layers = CNNNetSortTopologically(*network.get());
+        insertCopyLayer(layers);
+        layers = CNNNetSortTopologically(*network.get());
         insertDiagonalLayer(layers);
+        layers = CNNNetSortTopologically(*network.get());
+        substituteScaleShiftBroadCast(layers);
     };
 
     Config supported = Config({
         {TargetDevice::eGNA, Precision::FP32, [&](InferenceEngine::ICNNNetwork &network) -> CNNNetworkPtr {
             if (gnaPrecision == Precision::I16) {
                 ModelQuantizer<QuantI16> q;
-                return q.quantize(network, run_passes, input_scale_factor);
+                return q.quantize(network, run_passes, get_input_scale_factor());
             }
 
             if (gnaPrecision == Precision::I8) {
                 ModelQuantizer<QuantI8> q;
-                return q.quantize(network, run_passes, input_scale_factor);
+                return q.quantize(network, run_passes, get_input_scale_factor());
             }
             THROW_GNA_EXCEPTION << "no mans land for GNA precision";
         }},
@@ -1529,24 +1573,13 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) {
 
     supported.setDefaultDevice(TargetDevice::eGNA);
     auto newNet = supported.find_configuration(network).convert(network);
-    auto networkPrecision = newNet->getPrecision();
 
-    if (!networkPrecision.is_float()) {
-        gnadevice.reset(new GNADeviceHelper(gna_proc_type,
-                                            gna_lib_async_threads_num,
-                                            gna_openmp_multithreading,
-                                            performance_counting));
-        gnamem.reset(new gna_memory_type(
-                    make_polymorph<GNAAllocator>(*gnadevice.get()), PAGE_SIZE_BYTES));
-    } else {
-        gnamem.reset(new gna_memory_type(make_polymorph<std::allocator<uint8_t>>()));
-    }
+
 
     // creating intel dnn_t structures from network
     auto sortedNet = CNNNetSortTopologically(*newNet);
     std::vector<CNNLayerPtr> sortedNoMem;
-    std::map<std::string,
-                    std::vector<InferenceEngine::CNNLayerPtr>> memoryPairs;
+    std::unordered_map<std::string, std::vector<InferenceEngine::CNNLayerPtr>> memoryPairs;
     // find all memory layers pairs and mark which one used as outputs
     for (auto &layer : sortedNet) {
         auto generic = dynamic_cast<GenericLayer *>(layer.get());
@@ -1572,16 +1605,28 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) {
     // fill in extra storage with memory layers
     fillMemoryConnections(memoryPairs);
 
+    if (memory_connection.size() != 0) {
+        gna_lib_async_threads_num = 1;
+    }
+
+    auto networkPrecision = newNet->getPrecision();
+
+    if (!networkPrecision.is_float()) {
+        gnadevice.reset(new GNADeviceHelper(gna_proc_type,
+                                            gna_lib_async_threads_num,
+                                            gna_openmp_multithreading,
+                                            performance_counting));
+        gnamem.reset(new gna_memory_type(
+                make_polymorph<GNAAllocator>(*gnadevice.get()), PAGE_SIZE_BYTES));
+    } else {
+        gnamem.reset(new gna_memory_type(make_polymorph<std::allocator<uint8_t>>()));
+    }
+
     // keep inputs information and create input primitives
     newNet->getInputsInfo(inputsDataMap);
     if (inputsDataMap.empty()) {
         THROW_GNA_EXCEPTION << " No inputs for the topology";
     }
-    if (inputsDataMap.size() != 1) {
-        THROW_GNA_EXCEPTION << " cannot infer topologies with more than one inputs";
-    }
-
-    inputDims = inputsDataMap.begin()->second->getDims();
 
     // keep output dims
     newNet->getOutputsInfo(outputsDataMap);
@@ -1593,7 +1638,10 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) {
     }
     outputDims = outputsDataMap.begin()->second->dims;
 
-    ptr_inputs_global.resize(gna_lib_async_threads_num);
+    for (auto && input : inputsDataMap) {
+        get_ptr_inputs_global(input.first).resize(gna_lib_async_threads_num);
+    }
+
     ptr_outputs_global.resize(gna_lib_async_threads_num);
     // CreatingLayer primitives
     // TODO: solely gna_example convolution hack
@@ -1601,11 +1649,25 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) {
     for (auto layer = sortedNoMem.begin(); layer != sortedNoMem.end(); ++layer) {
         CreateLayerPrimitive(*layer);
     }
-    gnamem->bind_ptr(&ptr_outputs_global.front(), &dnnComponentsForLayer.back().second.ptr_outputs);
+    DnnComponentsForLayer::iterator output_component = std::find_if(dnnComponentsForLayer.begin(),
+                                                        dnnComponentsForLayer.end(),
+                                                        [&](const std::pair<std::string, intel_dnn_component_t>& v)
+                                                        { return outputsDataMap.begin()->first == v.first; });
+
+    if (output_component == dnnComponentsForLayer.end()) {
+        if (dnnComponentsForLayer.empty()) {
+            THROW_GNA_EXCEPTION << "No outputs found in internal structures";
+        }
+        // likely layer is fused. Take last one
+        output_component = std::prev(dnnComponentsForLayer.end());
+        gnalog() << "Output layer "<< outputsDataMap.begin()->first
+                    << " has not been found in component list. Took  "
+                    << output_component->first << " instead \n" << std::flush;
+    }
+    gnamem->bind_ptr(&ptr_outputs_global.front(), &output_component->second.ptr_outputs);
 
     // make room for active list
-    auto &last_component = dnnComponentsForLayer.back().second;
-    gnamem->reserve_ptr(nullptr, ALIGN64(last_component.num_bytes_per_output * last_component.num_rows_out));
+    gnamem->reserve_ptr(nullptr, ALIGN64(output_component->second.num_bytes_per_output * output_component->second.num_rows_out));
 
     void *pParallelExecutionData  = nullptr;
 
@@ -1630,16 +1692,16 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) {
     // in fp32 mode last PWL cannot be computed without that
     dnn.InitActiveList(NULL);
 
-    nnets.push_back(std::make_tuple(make_shared<CPPWrapper<intel_nnet_type_t>>(0), -1, InferenceEngine::BlobMap()));
+    nnets.push_back(std::make_tuple(make_shared<CPPWrapper<intel_nnet_type_t>>(), -1, InferenceEngine::BlobMap()));
 
     if (!networkPrecision.is_float()) {
         // number of layer gets calculated inside that InitGNAStruct function
         dnn.InitGNAStruct(&std::get<0>(nnets.front())->obj);
     }
 
-    // creating same gna RW segment for paralle infer requests
+    // creating same gna RW segment for parallel infer requests
     for (int i = 1; i != gna_lib_async_threads_num; i++) {
-        nnets.push_back(std::make_tuple(make_shared<CPPWrapper<intel_nnet_type_t>>(0), -1, InferenceEngine::BlobMap()));
+        nnets.push_back(std::make_tuple(make_shared<CPPWrapper<intel_nnet_type_t>>(), -1, InferenceEngine::BlobMap()));
 
         // this can be improved by just copy all structures, but we are too lazy
         dnn.InitGNAStruct(&std::get<0>(nnets.back())->obj);
@@ -1656,7 +1718,10 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) {
             }
         };
 
-        relocate(ptr_inputs_global[i], ptr_inputs_global[0]);
+        for (auto &&input : ptr_inputs_global_storage) {
+            relocate(input[i], input[0]);
+        }
+
         relocate(ptr_outputs_global[i], ptr_outputs_global[0]);
         for (int j = 0; j != std::get<0>(nnets.front())->obj.nLayers; j++) {
             auto & layer = std::get<0>(nnets[i])->obj.pLayers[j];
@@ -1666,11 +1731,60 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) {
             relocate(layer.pOutputsIntermediate, layer.pOutputsIntermediate);
         }
     }
-    orientation_in = dnn.component[0].orientation_in;
-    orientation_out = dnn.component[dnn.num_components()-1].orientation_out;
-    num_bytes_per_output = dnn.component[dnn.num_components()-1].num_bytes_per_output;
 
-    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(sortedNoMem.back());
+    // calculate input orientation ignoring memory layers, since their orientation does not change during inference right now
+    std::unordered_map<string, string> skippedLayers;
+    for (auto &layer : sortedNet) {
+        for (int i = 0; CNNNetHasPrevLayer(layer.get(), i); i++) {
+            auto prevLayer = CNNNetPrevLayer(layer.get(), i);
+            if (!skippedLayers.count(prevLayer->name)) {
+                if (CNNNetHasPrevLayer(prevLayer.get())) {
+                    continue;
+                }
+
+                // we are in one of the input layers
+                if (LayerInfo(prevLayer).isMemory()) {
+                    continue;
+                }
+            }
+
+            auto dnnLayer = findDnnLayer(layer);
+            string inputName = prevLayer->name;
+            if (skippedLayers.count(prevLayer->name)) {
+                inputName = skippedLayers[prevLayer->name];
+            }
+
+            // non-functional layer - skipped by GNA
+            if (nullptr == dnnLayer) {
+                // storing input name for skipped layer
+                skippedLayers[layer->name] = inputName;
+                continue;
+            }
+
+            // input orientation might be already initialized, thus verify that it matches
+            if (!orientation_in.count(inputName)) {
+                orientation_in[inputName] = dnnLayer->orientation_in;
+            } else {
+                if (orientation_in[inputName] != dnnLayer->orientation_in) {
+                    THROW_GNA_EXCEPTION << "orientation for input layer: " << inputName << "cannot be calculated";
+                }
+            }
+        }
+    }
+
+    orientation_out = output_component->second.orientation_out;
+    num_bytes_per_output = output_component->second.num_bytes_per_output;
+
+    // find output layer
+    auto output = std::find_if(sortedNet.begin(),
+                                sortedNet.end(),
+                                [&](const CNNLayerPtr& v)
+                                { return outputsDataMap.begin()->first == v.get()->name; });
+    if (output == sortedNet.end()) {
+        // likely layer is fused. Take last one
+        output = std::prev(sortedNet.end());
+    }
+    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(*output);
     output_scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f;
 
     num_rotate_rows = dnn.num_rotate_rows;
@@ -1692,7 +1806,7 @@ void GNAPlugin::DumpXNNToFile() const {
         }
         auto dump = gnadevice->dumpXnn(&std::get<0>(nnets.front())->obj, ptr_active_indices, num_active_indices);
         dump.header.rw_region_size = gnamem->getRWBytes();
-        dump.header.input_scaling_factor = input_scale_factor;
+        dump.header.input_scaling_factor = get_input_scale_factor();
         dump.header.output_scaling_factor = output_scale_factor;
         std::ofstream dumpStream(dumpXNNPath, std::ios::out | std::ios::binary);
         dumpStream.write(reinterpret_cast<char*>(&dump.header), sizeof(intel_gna_model_header));
@@ -1726,69 +1840,81 @@ void RotateFeatures(uint8_t *ptr_feat,
     }
 }
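Only the tail of RotateFeatures is visible in this hunk. It is assumed to transpose each num_rotate_rows x num_rotate_columns block of every feature vector, converting between interleaved and deinterleaved element order; a sketch under that assumption (not the plugin's actual body):

    #include <cstdint>
    #include <cstring>
    #include <vector>

    void RotateFeaturesSketch(uint8_t *ptr_feat, int element_size,
                              uint32_t num_feature_vectors,
                              uint32_t num_feature_vector_elements,
                              uint32_t num_rotate_rows,
                              uint32_t num_rotate_columns) {
        // assumes num_rotate_rows * num_rotate_columns == num_feature_vector_elements
        std::vector<uint8_t> temp(num_feature_vector_elements * element_size);
        for (uint32_t k = 0; k < num_feature_vectors; k++) {
            uint8_t *ptr_in = ptr_feat + k * num_feature_vector_elements * element_size;
            for (uint32_t i = 0; i < num_rotate_rows; i++) {
                for (uint32_t j = 0; j < num_rotate_columns; j++) {
                    // element (i, j) of the row-major block lands at (j, i)
                    std::memcpy(&temp[(j * num_rotate_rows + i) * element_size],
                                ptr_in + (i * num_rotate_columns + j) * element_size,
                                element_size);
                }
            }
            std::memcpy(ptr_in, temp.data(), temp.size());
        }
    }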
 
-uint32_t GNAPlugin::QueueInference(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result) {
-    return QueueInference(*input.begin()->second.get(), result);
-
-    /*if (!syncPoints.empty()) {
-        syncPoints.back().second = result;
-    }*/
-}
-
-uint32_t GNAPlugin::QueueInference(const InferenceEngine::Blob &input, InferenceEngine::BlobMap &result) {
-    auto inputLayout = input.layout();
-    if (inputLayout != Layout::NC && inputLayout != Layout::CN && inputLayout != NCHW) {
-        THROW_GNA_EXCEPTION << "Expected input blob to have Layout::NC or Layout::CN, but was: " << input.layout();
-    }
-    if (inputLayout == NCHW) {
-        inputLayout = NC;
-    }
-    auto is2D = input.layout() ==  Layout::NC || input.layout() == Layout ::CN;
-
+uint32_t GNAPlugin::QueueInference(const InferenceEngine::BlobMap &inputs, InferenceEngine::BlobMap &result) {
     auto freeNnet = std::find_if(std::begin(nnets), std::end(nnets), [](decltype(nnets.front()) & item) {
         return std::get<1>(item) == -1;
     });
 
     if (freeNnet == nnets.end()) {
-        THROW_IE_EXCEPTION << as_status << REQUEST_BUSY
-                           << "GNA executable network has max of " << static_cast<uint32_t >(gna_lib_async_threads_num)
-                           << " parallel infer requests, please sync one of already running";
+        if (memory_connection.size() != 0) {
+            Wait(0);
+            freeNnet = nnets.begin();
+        } else {
+            THROW_IE_EXCEPTION << as_status << REQUEST_BUSY
+                               << "GNA executable network has max of "
+                               << static_cast<uint32_t >(gna_lib_async_threads_num)
+                               << " parallel infer requests, please sync one of already running";
+        }
     }
 
+
     auto nnet = std::get<0>(*freeNnet).get();
     auto idx = static_cast<uint32_t>(std::distance(std::begin(nnets), freeNnet));
 
-    if (ptr_inputs_global[idx] == nullptr) {
-        // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance
-        THROW_GNA_EXCEPTION << "network not loaded : global input pointer not set";
-    }
+    for (auto &input : inputs) {
+        auto inputLayout = input.second->layout();
+        if (inputLayout != Layout::NC && inputLayout != Layout::CN && inputLayout != NCHW) {
+            THROW_GNA_EXCEPTION << "Expected input blob to have Layout::NC or Layout::CN, but was: "
+                                << input.second->layout();
+        }
+        if (inputLayout == NCHW) {
+            inputLayout = NC;
+        }
+        auto is2D = input.second->layout() == Layout::NC || input.second->layout() == Layout::CN;
 
-    if (orientation_in == kDnnUnknownOrientation) {
-        // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance
-        THROW_GNA_EXCEPTION << "network not loaded : input orientation not set";
-    }
+        if (!ptr_inputs_global_id.count(input.first)) {
+            // should not happen in user code; might happen only in integrations of GNAPlugin that bypass executable networks
+            THROW_GNA_EXCEPTION << "network not loaded : input pointer for " << input.first << " not set";
+        }
 
-    if (orientation_out == kDnnUnknownOrientation) {
-        // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance
-        THROW_GNA_EXCEPTION << "network not loaded : output orientation not set";
-    }
+        if (get_ptr_inputs_global(input.first)[idx] == nullptr) {
+            // should not happen in user code; might happen only in integrations of GNAPlugin that bypass executable networks
+            THROW_GNA_EXCEPTION << "network not loaded : input pointer for (" << input.first << " at inferRequest #"
+                                << idx << " not set";
+        }
 
-    ImportFrames(ptr_inputs_global[idx],
-                 input.cbuffer().as<float *>(),
-                 input.precision(),
-                 orientation_in,
-                 input.dims()[input.dims().size() - 1],
-                 is2D ? input.dims()[1] : input.dims()[input.dims().size() - 1],
-                 is2D ? input.dims()[0] :input.dims()[0]*input.dims()[2],
-                 is2D ? input.dims()[0] :input.dims()[0]*input.dims()[2]);
+        if (orientation_in[input.first] == kDnnUnknownOrientation) {
+            // should not happen in user code; might happen only in integrations of GNAPlugin that bypass executable networks
+            THROW_GNA_EXCEPTION << "network not loaded : input orientation for " << input.first << " not set";
+        }
+
+        if (orientation_out == kDnnUnknownOrientation) {
+            // should not happen in user code; might happen only in integrations of GNAPlugin that bypass executable networks
+            THROW_GNA_EXCEPTION << "network not loaded : output orientation not set";
+        }
 
-    if ((inputLayout == Layout::NC || inputLayout == Layout::NCHW) != (orientation_in == kDnnInterleavedOrientation)) {
-        RotateFeatures(reinterpret_cast<uint8_t*>(ptr_inputs_global[idx]),
-                       gnadevice ? 2 : 4,
-                       // TODO: only works for cnn4a and google command so far
-                       input.dims()[input.dims().size() - 1],
-                       is2D ? input.dims()[0] :input.dims()[0]*input.dims()[2],  // num_feature_vectors looks batch should be there
-                       num_rotate_rows,
-                       num_rotate_columns);
+        auto dims = input.second->dims();
+
+        ImportFrames(get_ptr_inputs_global(input.first)[idx],
+                     input.second->cbuffer().as<float *>(),
+                     input.second->precision(),
+                     orientation_in[input.first],
+                     dims[dims.size() - 1],
+                     is2D ? dims[1] : dims[dims.size() - 1],
+                     is2D ? dims[0] : dims[0] * dims[1] * dims[2],
+                     is2D ? dims[0] : dims[0] * dims[1] * dims[2]);
+        bool isOneChannel = input.second->getTensorDesc().getDims()[1] == 1;
+        if (((inputLayout == Layout::NC || inputLayout == Layout::NCHW)
+            != (orientation_in[input.first] == kDnnInterleavedOrientation))
+            && !isOneChannel) {
+            RotateFeatures(reinterpret_cast<uint8_t *>(get_ptr_inputs_global(input.first)[idx]),
+                           gnadevice ? 2 : 4,
+                           // TODO: only works for cnn4a and google command so far
+                           dims[dims.size() - 1],
+                           is2D ? dims[0] : dims[0] * dims[2],  // num_feature_vectors looks batch should be there
+                           num_rotate_rows,
+                           num_rotate_columns);
+        }
     }
 
     if (!gnadevice) {
@@ -1810,7 +1936,7 @@ void GNAPlugin::Wait(uint32_t idx) {
     }
 
     std::get<1>(nnets[idx]) = -1;
-    auto & output = *std::get<2>(nnets[idx]).begin()->second;
+    auto & result = std::get<2>(nnets[idx]);
 #ifdef PLOT
     dnn.BeginNewWrite();
     if (dnn.num_components() != 0) {
@@ -1819,18 +1945,38 @@ void GNAPlugin::Wait(uint32_t idx) {
     }
     dnn.WriteInputAndOutputTextGNA(&std::get<0>(nnets.front())->obj);
 #endif
+    if (result.size() != 1) {
+        THROW_GNA_EXCEPTION << "Invalid number of outputs for infer request: " << result.size() << ",  only 1 supported";
+    }
+    auto & output = *result.begin()->second;
 
     if (output.layout() == Layout::NC) {
         // TODO: rotate can be incorporated with exporting - used only in unit tests so far
         // TODO: restore:
 //        if (orientation_out != kDnnInterleavedOrientation) {
+//            if (inputs.size() != 1) {
+//                THROW_GNA_EXCEPTION << "Invalid number of inputs for deinterleave " << inputs.size()
+//                                    << ", only 1 supported";
+//            }
+//            auto dims = inputs.begin()->second->dims();
 //            RotateFeatures(reinterpret_cast<uint8_t*>(ptr_outputs_global),
 //                           gnadevice ? 2 : 4,
-//                           input.dims()[input.dims().size() - 1],
-//                           input.dims()[0],  // num_feature_vectors looks batch should be there
-//                           input.dims()[0],
-//                           input.dims()[input.dims().size() - 1]);
+//                           dims[dims.size() - 1],
+//                           dims[0],  // num_feature_vectors looks batch should be there
+//                           dims[0],
+//                           dims[dims.size() - 1]);
 //        }
+        // we consider the last layer to be the output ...
+        size_t output_layer_index = std::max(0, static_cast<int>(std::get<0>(nnets[idx])->obj.nLayers - 1));
+        if (gnadevice && std::get<0>(nnets[idx])->obj.pLayers[output_layer_index].pOutputs != ptr_outputs_global[idx]) {
+            // ...as this is not true, we should look for output layer index
+            for (int j = 0; j != std::get<0>(nnets[idx])->obj.nLayers; j++) {
+                if (std::get<0>(nnets[idx])->obj.pLayers[j].pOutputs == ptr_outputs_global[idx]) {
+                    output_layer_index = j;
+                    break;
+                }
+            }
+        }
 
         ExportScores(output.buffer(),
                      ptr_outputs_global[idx],
@@ -1841,7 +1987,7 @@ void GNAPlugin::Wait(uint32_t idx) {
                      output.dims()[0],
                      output.dims()[0],
                      // TODO: create better getter consider multiple outputs case
-                     gnadevice ? std::get<0>(nnets[idx])->obj.pLayers[std::get<0>(nnets[idx])->obj.nLayers - 1].nBytesPerOutput : sizeof(float),
+                     gnadevice ? std::get<0>(nnets[idx])->obj.pLayers[output_layer_index].nBytesPerOutput : sizeof(float),
                      sizeof(float));
     } else if (output.layout() != Layout::CN) {
         THROW_GNA_EXCEPTION << "Expected output blob to have Layout::NC or Layout::CN. But was " << output.layout();
@@ -1884,13 +2030,6 @@ void GNAPlugin::Wait(uint32_t idx) {
     }
 }
 
-
-void GNAPlugin::Infer(const InferenceEngine::Blob &input, InferenceEngine::Blob &output) {
-    BlobMap result;
-    result["output"] = std::shared_ptr<Blob>(&output, [](Blob*){});
-    Wait(QueueInference(input, result));
-}
-
 void GNAPlugin::Reset() {
     for (auto && memLayer : memory_connection) {
         std::memset(memLayer.second.gna_ptr, 0, memLayer.second.reserved_size);
@@ -1900,10 +2039,23 @@ void GNAPlugin::Reset() {
     }
 }
 
-void GNAPlugin::Infer(const BlobMap &inputs, BlobMap &result) {
-    auto &input = *inputs.begin()->second.get();
-    auto &output = *result.begin()->second.get();
-    Infer(input, output);
+void GNAPlugin::Infer(const InferenceEngine::Blob &input, InferenceEngine::Blob &output) {
+    BlobMap bmInput;
+    BlobMap bmOutput;
+    if (inputsDataMap.size() != 1) {
+        THROW_GNA_EXCEPTION << "cannot infer using Infer(Blob&, Blob&)"<< "model accepts " << inputsDataMap.size() << "inputs";
+    }
+    if (outputsDataMap.size() != 1) {
+        THROW_GNA_EXCEPTION << "cannot infer using Infer(Blob&, Blob&)"<< "model accepts " << outputsDataMap.size() << "outputs";
+    }
+
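+    // wrap the caller's blobs in non-owning shared_ptrs (no-op deleters) keyed by the
+    // network's single input and output names, then defer to the BlobMap overload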
+    bmInput[inputsDataMap.begin()->first] = std::shared_ptr<Blob>(const_cast<Blob*>(&input), [](Blob*){});
+    bmOutput[outputsDataMap.begin()->first] = std::shared_ptr<Blob>(&output, [](Blob*){});
+    Infer(bmInput, bmOutput);
+}
+
+void GNAPlugin::Infer(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result) {
+    Wait(QueueInference(input, result));
 }
 
 Blob::Ptr GNAPlugin::GetOutputBlob(InferenceEngine::Precision precision) {
@@ -1914,10 +2066,11 @@ Blob::Ptr GNAPlugin::GetOutputBlob(InferenceEngine::Precision precision) {
     return outputBlob;
 }
 
-Blob::Ptr GNAPlugin::GetInputBlob(InferenceEngine::Precision precision) {
+Blob::Ptr GNAPlugin::GetInputBlob(std::string name, InferenceEngine::Precision precision) {
     InferenceEngine::Blob::Ptr inputBlob;
     // need to have intermediate blob for interleave conversion
     // TODO: NCHW format support is experimental = c++ MO did insert reshape, while TF mo - not
+    auto inputDims = inputsDataMap[name]->getDims();
     inputBlob = make_blob_with_precision(precision, inputDims.size() == 2 ? NC : NCHW, inputDims);
     inputBlob->allocate();
     return inputBlob;
@@ -1955,7 +2108,8 @@ InferenceEngine::IExecutableNetwork::Ptr GNAPlugin::ImportNetwork(const std::str
     auto serial = GNAModelSerial(&std::get<0>(nnets.back())->obj, mt);
     serial.Import(basePtr, header.gnaMemSize, inputStream);
 
-    ptr_inputs_global.push_back(reinterpret_cast<float*>(reinterpret_cast<uint8_t *> (basePtr) + header.input.descriptor_offset));
+    get_ptr_inputs_global("input").push_back(reinterpret_cast<float*>(reinterpret_cast<uint8_t *> (basePtr) + header.input.descriptor_offset));
     ptr_outputs_global.push_back(reinterpret_cast<float*>(reinterpret_cast<uint8_t *> (basePtr) + header.output.descriptor_offset));
 
     auto getOrientation = [](intel_nnet_layer_t & layer) {
@@ -1963,14 +2117,14 @@ InferenceEngine::IExecutableNetwork::Ptr GNAPlugin::ImportNetwork(const std::str
            kDnnNonInterleavedOrientation : kDnnInterleavedOrientation;
     };
 
-    orientation_in = getOrientation(std::get<0>(nnets.back())->obj.pLayers[0]);
+    orientation_in["input"] = getOrientation(std::get<0>(nnets.back())->obj.pLayers[0]);
     orientation_out = getOrientation(std::get<0>(nnets.back())->obj.pLayers[std::get<0>(nnets.back())->obj.nLayers-1]);
 
     num_bytes_per_output = header.output.element_size;
 
 
     outputDims = SizeVector({header.output.elements_count / header.nGroup, header.nGroup});
-    inputDims = SizeVector({header.input.elements_count / header.nGroup, header.nGroup});
+    auto inputDims = SizeVector({header.input.elements_count / header.nGroup, header.nGroup});
 
     inputsDataMap["input"] = std::make_shared<InputInfo>();
     inputsDataMap["input"]->setInputData(make_shared<Data>("input",
@@ -1983,7 +2137,7 @@ InferenceEngine::IExecutableNetwork::Ptr GNAPlugin::ImportNetwork(const std::str
                                                  Layout::NC);
 
     output_scale_factor = header.output.scaleFactor;
-    input_scale_factor = header.input.scaleFactor;
+    input_scale_factor["input"] = header.input.scaleFactor;
 
     num_rotate_rows = header.nRotateRows;
     num_rotate_columns = header.nRotateColumns;
@@ -2007,20 +2161,25 @@ InferenceEngine::IExecutableNetwork::Ptr GNAPlugin::ImportNetwork(const std::str
 }
 
 void GNAPlugin::Export(const std::string &fileName) {
-    if (ptr_inputs_global.empty() || ptr_outputs_global.empty()) {
+    if (ptr_inputs_global_id.empty() || ptr_outputs_global.empty()) {
         THROW_GNA_EXCEPTION << " network not loaded";
     }
 
+    if (ptr_inputs_global_id.size() != 1) {
+        THROW_GNA_EXCEPTION << " exporting network with multiple inputs not supported";
+    }
+
     std::fstream outStream(fileName, ios_base::out | ios_base::binary);
 
     // TODO: nnet group parameter looks only used in application - so can we move this line into load network.
+    auto inputDims = inputsDataMap.begin()->second->getDims();
     if (inputDims.size() == 2) {
         std::get<0>(nnets.front())->obj.nGroup = inputDims[1];
     }
 
     auto serial = GNAModelSerial(&std::get<0>(nnets.front())->obj,
-                   {input_scale_factor,
-                    ptr_inputs_global[0],
+                   {get_input_scale_factor(),
+                    ptr_inputs_global_storage.front()[0],
                     2,
                     static_cast<uint32_t>(InferenceEngine::details::product(inputsDataMap.begin()->second->getDims()))},
                    {output_scale_factor,
@@ -2043,7 +2202,209 @@ void GNAPlugin::GetPerformanceCounts(std::map<std::string, InferenceEngine::Infe
 }
 
 void GNAPlugin::AddExtension(InferenceEngine::IExtensionPtr extension) {}
-void GNAPlugin::SetConfig(const std::map<std::string, std::string> &config) {}
+
+void GNAPlugin::SetConfig(const std::map<std::string, std::string> &config) {
+    std::vector<std::string> supportedConfigOptions = {
+        GNA_CONFIG_KEY(SCALE_FACTOR),
+        GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE),
+        GNA_CONFIG_KEY(DEVICE_MODE),
+        GNA_CONFIG_KEY(COMPACT_MODE),
+        CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS),
+        GNA_CONFIG_KEY(PRECISION),
+        GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN),
+        CONFIG_KEY(PERF_COUNT),
+        GNA_CONFIG_KEY(LIB_N_THREADS),
+        CONFIG_KEY(SINGLE_THREAD)
+    };
+
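+    // note: substring match below, so suffixed keys such as a per-input SCALE_FACTOR are accepted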
+    for (auto& item : config) {
+        auto keys = std::find_if(supportedConfigOptions.begin(), supportedConfigOptions.end(), [&item](std::string supportedConfigOption) {
+            return item.first.find(supportedConfigOption) != std::string::npos;
+        });
+        if (keys == supportedConfigOptions.end()) {
+            THROW_GNA_EXCEPTION << as_status << NOT_FOUND << "Incorrect GNA Plugin config. Key " << item.first << " not supported";
+        }
+    }
+
+    // 'key' holds the suffix of a matched prefixed option, 'value' holds the option's value
+    std::string key;
+    std::string value;
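+    // if_set: runs the handler when the exact config key is present; 'value' receives its setting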
+    auto if_set = [&](std::string keyInput, const std::function<void()> & handler) {
+        auto keyInMap = config.find(keyInput);
+        if (keyInMap != config.end()) {
+            value = keyInMap->second;
+            handler();
+        }
+    };
+
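+    // if_start: runs the handler for every config key that starts with keyInput (e.g. a
+    // per-input SCALE_FACTOR key); 'key' receives the suffix after the separator character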
+    auto if_start = [&](std::string keyInput, const std::function<void()> & handler) {
+        for (auto && c : config) {
+            if (c.first.find(keyInput) == 0) {
+                if (c.first.size() > keyInput.size() + 1) {
+                    key = c.first.substr(keyInput.size() + 1);
+                    value = c.second;
+                    handler();
+                }
+            }
+        }
+    };
+
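+    // approximate float equality with a relative tolerance of 1e-5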
+    auto fp32eq = [](float p1, float p2) -> bool {
+        return (std::abs(p1 - p2) <= 0.00001f * std::min(std::abs(p1), std::abs(p2)));
+    };
+
+    auto & log = gnalog();
+
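+    // per-input scale factors arrive as one SCALE_FACTOR key per input, suffixed with the input name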
+    if_start(GNA_CONFIG_KEY(SCALE_FACTOR), [&, this] {
+        // only identical scale factors supported so far
+        auto ref = input_scale_factor.size() ? input_scale_factor.begin()->second : 1.0;
+        input_scale_factor[key] = std::stod(value);
+        if (ref != 1.0 && !fp32eq(input_scale_factor[key], ref)) {
+            log << "only identical input scale factors supported, but provided: " << ref << " and " << input_scale_factor[key];
+            THROW_GNA_EXCEPTION << "only identical input scale factors supported, but provided: " << ref << " and " << input_scale_factor[key];
+        }
+    });
+
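+    // fall back to a bare SCALE_FACTOR key (no input suffix): a single factor stored under a placeholder name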
+    if (input_scale_factor.empty()) {
+        if_set(GNA_CONFIG_KEY(SCALE_FACTOR), [&] {
+            input_scale_factor["placeHolder"] = std::stod(value);
+        });
+    }
+
+    if_set(GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE), [&] {
+        dumpXNNPath = value;
+    });
+
+    if_set(GNA_CONFIG_KEY(DEVICE_MODE), [&] {
+        static caseless_unordered_map <std::string, uint32_t> supported_values = {
+                {GNAConfigParams::GNA_AUTO, GNA_AUTO},
+                {GNAConfigParams::GNA_HW, GNA_HARDWARE},
+                {GNAConfigParams::GNA_SW, GNA_SOFTWARE},
+                {GNAConfigParams::GNA_SW_EXACT, GNA_SOFTWARE & GNA_HARDWARE}
+        };
+        auto procType = supported_values.find(value);
+        if (procType == supported_values.end()) {
+            log << "GNA device mode unsupported: " << value;
+            THROW_GNA_EXCEPTION << "GNA device mode unsupported: " << value;
+        }
+        gna_proc_type = static_cast<intel_gna_proc_t>(procType->second);
+    });
+
+    if_set(GNA_CONFIG_KEY(COMPACT_MODE), [&] {
+        if (value == PluginConfigParams::YES) {
+            compact_mode = true;
+        } else if (value == PluginConfigParams::NO) {
+            compact_mode = false;
+        } else {
+            log << "GNA compact mode should be YES/NO, but not" << value;
+            THROW_GNA_EXCEPTION << "GNA compact mode should be YES/NO, but not" << value;
+        }
+    });
+
+    if_set(CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS), [&] {
+        if (value == PluginConfigParams::YES) {
+            exclusive_async_requests = true;
+        } else if (value == PluginConfigParams::NO) {
+            exclusive_async_requests = false;
+        } else {
+            log << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not " << value;
+            THROW_GNA_EXCEPTION << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not " << value;
+        }
+    });
+
+    if_set(GNA_CONFIG_KEY(PRECISION), [&] {
+        auto precision = Precision::FromStr(value);
+        if (precision != Precision::I8 && precision != Precision::I16) {
+            log << "Unsupported precision of GNA hardware, should be Int16 or Int8, but was: " << value;
+            THROW_GNA_EXCEPTION << "Unsupported precision of GNA hardware, should be Int16 or Int8, but was: " << value;
+        }
+        gnaPrecision = precision;
+    });
+
+    if_set(GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN), [&] {
+        if (value == PluginConfigParams::YES) {
+            uniformPwlDesign = true;
+        } else if (value == PluginConfigParams::NO) {
+            uniformPwlDesign = false;
+        } else {
+            log << "GNA pwl uniform algorithm parameter "
+                << "should be equal to YES/NO, but not" << value;
+            THROW_GNA_EXCEPTION << "GNA pwl uniform algorithm parameter "
+                                << "should be equal to YES/NO, but not" << value;
+        }
+    });
+
+    if_set(CONFIG_KEY(PERF_COUNT), [&] {
+        if (value == PluginConfigParams::YES) {
+            performance_counting = true;
+        } else if (value == PluginConfigParams::NO) {
+            performance_counting = false;
+        } else {
+            log << "GNA performance counter enabling parameter "
+                << "should be equal to YES/NO, but not" << value;
+            THROW_GNA_EXCEPTION << "GNA performance counter enabling parameter "
+                                << "should be equal to YES/NO, but not" << value;
+        }
+    });
+
+    if_set(GNA_CONFIG_KEY(LIB_N_THREADS), [&] {
+        uint64_t lib_threads = std::stoul(value, NULL, 10);
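+        // allowed range is 1..126, i.e. up to (uint8_t max)/2 - 1 threads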
+        if (lib_threads == 0 || lib_threads > std::numeric_limits<uint8_t>::max()/2-1) {
+            log << "Unsupported accelerator lib number of threads: " << value << ", should be greateer than 0 and less than 127";
+            THROW_GNA_EXCEPTION << "Unsupported accelerator lib number of threads: " << value
+                                << ", should be greateer than 0 and less than 127";
+        }
+        gna_lib_async_threads_num = lib_threads;
+    });
+
+    if_set(CONFIG_KEY(SINGLE_THREAD), [&] {
+        if (value == PluginConfigParams::YES) {
+            gna_openmp_multithreading = false;
+        } else if (value == PluginConfigParams::NO) {
+            gna_openmp_multithreading = true;
+        } else {
+            log << "SINGLE_THREAD should be YES/NO, but not " << value;
+            THROW_GNA_EXCEPTION << "SINGLE_THREAD should be YES/NO, but not " << value;
+        }
+    });
+}
+
+/**
+ * @deprecated Use the version with the config parameter
+ */
+void GNAPlugin::QueryNetwork(const InferenceEngine::ICNNNetwork& network,
+                             InferenceEngine::QueryNetworkResult& res) const {
+    QueryNetwork(network, {}, res);
+}
+
+void GNAPlugin::QueryNetwork(const InferenceEngine::ICNNNetwork& network,
+                             const std::map<std::string, std::string>& config,
+                             InferenceEngine::QueryNetworkResult& res) const {
+    std::unordered_set<CNNLayer *> allLayers;
+    InferenceEngine::InputsDataMap inputs;
+
+    network.getInputsInfo(inputs);
+    std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(network);
+
+    if (inputs.empty()) {
+        THROW_GNA_EXCEPTION << "Network is empty (GNA)\n";
+    }
+
+    auto const & secondLayers = inputs.begin()->second->getInputData()->getInputTo();
+    if (secondLayers.empty()) {
+        THROW_GNA_EXCEPTION << "Network consists of input layer only (GNA)\n";
+    }
+
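+    // DFS over every layer reachable from the first input; a layer is reported as
+    // supported when its type maps to a known GNA layer type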
+    InferenceEngine::details::UnorderedDFS(allLayers,
+                                           secondLayers.begin()->second,
+                                           [&](CNNLayerPtr const layer) {
+                                                if (GNAPluginNS::GNAPlugin::LayerTypeFromStr(layer->type) != NO_TYPE) {
+                                                    res.supportedLayers.insert(layer->name);
+                                                }
+                                            }, false);
+}
 
 intel_dnn_component_t * GNAPlugin::find_first_unused_input(InferenceEngine::CNNLayerPtr current) {
     if (current->insData.empty()) return nullptr;
@@ -2076,7 +2437,7 @@ void GNAPlugin::connectOutput(InferenceEngine::CNNLayerPtr layer, void *ptr, voi
                 } else {
                     IE_ASSERT(nextMemoryLayer.reserved_size == ALIGN64(num_data_bytes_out));
                     // same offsets
-                    gnamem->bind_ptr(ptr, &nextMemoryLayer.gna_ptr, nextMemoryLayer.reserved_offset);
+                    gnamem->bind_ptr(ptr, &nextMemoryLayer.gna_ptr, 0);
                 }
                 return;
             }
@@ -2119,6 +2480,13 @@ void GNAPlugin::connectOutput(InferenceEngine::CNNLayerPtr layer, void *ptr, voi
                                     });
                         if (included == concat_connection.end()) {
                             gnamem->reserve_ptr(&concatLayerInfoItem.gna_ptr, ALIGN64(concatLayerInfoItem.reserved_size));
+
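+                            // record how much of the 64-byte-aligned concat buffer lies past each
+                            // direct network input, so connectInput's size check accepts it later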
+                            for (auto && inputLayer : concatLayerInfoItem.concatInputLayers) {
+                                if (InferenceEngine::details::CaselessEq<std::string>()(inputLayer.name, "input")) {
+                                    bytes_alllocated_for_input[inputLayer.name] = ALIGN64(concatLayerInfoItem.reserved_size) - inputLayer.offset;
+                                }
+                            }
                         }
                         concatLayerInfo->second.output_allocation_flag = true;
                     }
@@ -2158,7 +2526,15 @@ intel_dnn_component_t * GNAPlugin::findDnnLayer(CNNLayerPtr __layer) {
     return nullptr;
 }
 
-GNAPlugin::ConnectionDetails GNAPlugin::connectInput(CNNLayerPtr layer, void *ptr, size_t num_data_bytes_in, size_t offset, int idx) {
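+// returns the pointer vector for the named input, lazily creating its storage
+// on first use and caching an iterator to it by name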
+std::vector<void *>& GNAPlugin::get_ptr_inputs_global(std::string name) {
+    if (!ptr_inputs_global_id.count(name)) {
+        ptr_inputs_global_storage.push_front({});
+        ptr_inputs_global_id[name] = ptr_inputs_global_storage.begin();
+    }
+    return *ptr_inputs_global_id[name];
+}
+
+GNAPlugin::ConnectionDetails GNAPlugin::connectInput(CNNLayerPtr layer, void *ptr, size_t num_data_bytes_in, int32_t offset, int idx) {
     // selecting particular input layers
     auto prevLayer = CNNNetPrevLayer(layer, idx);
 
@@ -2166,15 +2542,24 @@ GNAPlugin::ConnectionDetails GNAPlugin::connectInput(CNNLayerPtr layer, void *pt
 
     // real input not a memory input
     if (LayerInfo(prevLayer).isInput()) {
-        if (0 == bytes_alllocated_for_input) {
-            gnamem->push_value(&ptr_inputs_global.front(), static_cast<uint8_t>(0), num_data_bytes_in, 64);
-            bytes_alllocated_for_input = num_data_bytes_in;
+        if (0 == bytes_alllocated_for_input[prevLayer->name]) {
+            gnamem->push_value(&get_ptr_inputs_global(prevLayer->name).front(), static_cast<uint8_t>(0), num_data_bytes_in, 64);
+            bytes_alllocated_for_input[prevLayer->name] = num_data_bytes_in;
         }
-        if (ALIGN(num_data_bytes_in, 64) > ALIGN(bytes_alllocated_for_input, 64)) {
-            THROW_IE_EXCEPTION << "Layer: " << layer->name << " Cannot bind pointer to already allocated input, due to size_allocated="
-                                  << bytes_alllocated_for_input << ", and size_requested=" << num_data_bytes_in;
+        if (ALIGN(num_data_bytes_in, 64) > ALIGN(bytes_alllocated_for_input[prevLayer->name], 64)) {
+            THROW_GNA_EXCEPTION
+                << "Layer: " << layer->name
+                << " Cannot bind pointer to already allocated input(" << prevLayer->name
+                << "), due to size_allocated=" << bytes_alllocated_for_input[prevLayer->name]
+                << ", and size_requested=" << num_data_bytes_in;
         }
-        gnamem->bind_ptr(ptr, &ptr_inputs_global.front(), offset);
+
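+        // a negative offset reverses the binding direction: the input's global pointer is
+        // bound inside the already-reserved region behind ptr (e.g. an input feeding a concat)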
+        if (offset >= 0) {
+            gnamem->bind_ptr(ptr, &get_ptr_inputs_global(prevLayer->name).front(), offset);
+        } else {
+            gnamem->bind_ptr(&get_ptr_inputs_global(prevLayer->name).front(), ptr, -offset);
+        }
+
         return prevLayer;
     }
 
@@ -2213,7 +2598,7 @@ GNAPlugin::ConnectionDetails GNAPlugin::connectInput(CNNLayerPtr layer, void *pt
                                                     prevLayer->name);
         if (concatLayerInfo != concat_connection.end()) {
             auto & concatLayerInfoItem = concatLayerInfo->second;
-            // dnnLayer that is input for concat output layer
+            // dnnLayer that is input for concat layer
             gnamem->bind_ptr(ptr, &concatLayerInfoItem.gna_ptr, offset);
             // return layer over concat
             return CNNNetPrevLayer(prevLayer);