Publishing 2019 R1 content
[platform/upstream/dldt.git] / inference-engine / src / gna_plugin / gna_plugin.cpp
index 620aa48..fc57d52 100644
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -131,7 +131,7 @@ void GNAPlugin::copyInputData(T *dst,
         for (uint32_t i = 0; i < num_frames; i++) {
             for (uint32_t j = 0; j < num_vector_elements; j++) {
                 if (!std::is_same<T, U>::value) {
-                    dst[j * num_group + i] = GNAPluginNS::ConvertFloatToInt16(src[i * num_vector_elements + j] * input_scale_factor);
+                    dst[j * num_group + i] = GNAPluginNS::ConvertFloatToInt16(src[i * num_vector_elements + j] * get_input_scale_factor());
                 } else {
                     dst[j * num_group + i] = src[i * num_vector_elements + j];
                 }
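Note the destination index `dst[j * num_group + i]`: the copy transposes frame-major input into GNA's interleaved (element-major) layout while applying the per-input scale factor. `GNAPluginNS::ConvertFloatToInt16` is defined elsewhere in the plugin; a saturating round-and-cast along these lines is assumed here (a sketch, not the plugin's actual definition):

    #include <cstdint>
    #include <cmath>

    // Assumed behaviour: round to nearest, then saturate to the int16_t range.
    static int16_t ConvertFloatToInt16(float value) {
        float rounded = std::roundf(value);
        if (rounded > 32767.0f)  return INT16_MAX;   // saturate high
        if (rounded < -32768.0f) return INT16_MIN;   // saturate low
        return static_cast<int16_t>(rounded);
    }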
@@ -154,7 +154,7 @@ void GNAPlugin::copyInputData(T *dst,
                 U *ptr_src_vec = const_cast<U *>(reinterpret_cast<const U *>(src) + i * num_vector_elements);
                 std::memset(ptr_dst_vec, 0, num_vector_stride * sizeof(T));
                 for (int j=0; j < num_vector_elements; j++) {
-                    ptr_dst_vec[j] = GNAPluginNS::ConvertFloatToInt16(ptr_src_vec[j] * input_scale_factor);
+                    ptr_dst_vec[j] = GNAPluginNS::ConvertFloatToInt16(ptr_src_vec[j] * get_input_scale_factor());
                 }
             }
 
@@ -189,9 +189,13 @@ void GNAPlugin::copyInputDataWithSplit(T *const dst,
     for (auto&& outputLayer : splitInfo.splitOutputLayers) {
         uint32_t begin = outputLayer.offset/precision_size;
         uint32_t end = (outputLayer.offset + outputLayer.pure_size)/precision_size;
+        if (dst_ptr - dst >= end) {
+            // this output layer's pointer is bound to the previous one's; skip it
+            continue;
+        }
         for (uint32_t i = begin; i < end; ++i) {
             if (!std::is_same<T, U>::value) {
-                *(dst_ptr++) = GNAPluginNS::ConvertFloatToInt16(*(src_ptr++) * input_scale_factor);
+                *(dst_ptr++) = GNAPluginNS::ConvertFloatToInt16(*(src_ptr++) * get_input_scale_factor());
             } else {
                 *(dst_ptr++) = *(src_ptr++);
             }
@@ -285,46 +289,39 @@ void GNAPlugin::ImportFrames(
                   uint32_t num_group,
                   uint32_t num_vector_elements,
                   uint32_t num_vector_stride) {
-    // special case if split/slice layers connected
-    // with Input detected
-    auto it = split_connection.end();
-    if (split_connection.size() != 0) {
-        it = std::find_if(split_connection.begin(), split_connection.end(), []
-                    (const std::pair<std::string, GNASplitLayer> &item) -> bool {
-                        return CaselessEq<std::string>()(item.second.splitInputLayer.name, "Input");
-                    });
-    }
     if (orientation == kDnnInterleavedOrientation) {
         // TODO: fix that as well
-        if (input_precision.size() == 2) {
+        if (input_precision == Precision::U8) {
+            int16_t *dst = const_cast<int16_t *>(reinterpret_cast<const int16_t *>(ptr_dst));
+            uint8_t *src = const_cast<uint8_t *>(reinterpret_cast<const uint8_t *>(ptr_src));
+            copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
+        } else if (input_precision.size() == 2) {
             int16_t *dst = const_cast<int16_t *>(reinterpret_cast<const int16_t *>(ptr_dst));
             int16_t *src = const_cast<int16_t *>(reinterpret_cast<const int16_t *>(ptr_src));
-            if (it != split_connection.end()) {
-                copyInputDataWithSplit(dst, src, it->second, input_precision.size());
-            } else {
-                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
-            }
+            copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
         } else if (input_precision.size() == 4) {
             if (!gnadevice) {
                 float *dst = const_cast<float *>(reinterpret_cast<const float *>(ptr_dst));
                 float *src = const_cast<float *>(reinterpret_cast<const float *>(ptr_src));
-                if (it != split_connection.end()) {
-                    copyInputDataWithSplit(dst, src, it->second, input_precision.size());
-                } else {
-                    copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
-                }
+                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
             } else {
                 int16_t *dst = reinterpret_cast<int16_t *>(ptr_dst);
                 const float *src = reinterpret_cast<const float *>(ptr_src);
-                if (it != split_connection.end()) {
-                    copyInputDataWithSplit(dst, src, it->second, input_precision.size());
-                } else {
-                    copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
-                }
+                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
             }
         }
     } else {
-        if (input_precision.size()== 2) {
+        if (input_precision == Precision::U8) {
+            uint8_t *src = const_cast<uint8_t *>(reinterpret_cast<const uint8_t *>(ptr_src));
+            if (!gnadevice) {
+                float *dst = const_cast<float *>(reinterpret_cast<const float *>(ptr_dst));
+                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
+            } else {
+                int16_t *dst = const_cast<int16_t *>(reinterpret_cast<const int16_t *>(ptr_dst));
+                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
+            }
+
+        } else if (input_precision.size() == 2) {
             int16_t *dst = const_cast<int16_t *>(reinterpret_cast<const int16_t *>(ptr_dst));
             int16_t *src = const_cast<int16_t *>(reinterpret_cast<const int16_t *>(ptr_src));
             copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
@@ -342,9 +339,8 @@ void GNAPlugin::ImportFrames(
     }
 }
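With the split/slice special case moved out (it now goes through the new AffineFilter primitive), ImportFrames dispatches only on input precision and on whether a GNA device is present. The behaviour of the newly added U8 branches, read off the hunk above:

    orientation      src      dst
    interleaved      uint8_t  int16_t (converted and scaled inside copyInputData)
    non-interleaved  uint8_t  float when !gnadevice, int16_t when a device is set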
 
-void GNAPlugin::fillMemoryConnections(std::map<std::string,
-                                            std::vector<InferenceEngine::CNNLayerPtr>>&
-                                                                            memoryPairs) {
+void GNAPlugin::fillMemoryConnections(std::unordered_map<std::string,
+                                            std::vector<InferenceEngine::CNNLayerPtr>>& memoryPairs) {
     for (auto &memory : memoryPairs) {
         auto inputLayer = memory.second[1];
         auto outputLayer = memory.second[0];
@@ -401,7 +397,7 @@ void GNAPlugin::fillSplitConnections(InferenceEngine::CNNLayerPtr layer) {
     LayerInfo ptrSplitLayerInputLayerInfo(ptrSplitLayerInput);
     for (size_t i = 0; i < layer->outData.size(); ++i) {
         size_t padding = 0;
-        size_t layer_size = 0;
+        size_t output_layer_size = 0;
         auto& dataOutput = layer->outData[i];
 
         if (!dataOutput || !dataInput) {
@@ -416,16 +412,19 @@ void GNAPlugin::fillSplitConnections(InferenceEngine::CNNLayerPtr layer) {
 
             padding = std::max(padding, LayerInfo(ptrSplitLayerOutput).paddingSize())
                                                         * dataOutput->precision.size();
-            layer_size =
+            output_layer_size =
                     InferenceEngine::details::product(begin(dataOutput->dims),
                                                      end(dataOutput->dims)) * dataOutput->precision.size();
 
-            layerInfoItem.splitOutputLayers.emplace_back(ptrSplitLayerOutput->name, split_size, layer_size);
+            if (ptrSplitLayerOutput->type == "AffineFilter") {
+                size_t aligned64_offset = ptrSplitLayerOutput->GetParamAsInt("offset");
+                layerInfoItem.splitOutputLayers.emplace_back(ptrSplitLayerOutput->name, aligned64_offset, output_layer_size);
+            } else {
+                layerInfoItem.splitOutputLayers.emplace_back(ptrSplitLayerOutput->name, split_size, output_layer_size);
+            }
         }
 
-        split_size += ptrSplitLayerInputLayerInfo.isInput() ?
-                                ALIGN64(padding + layer_size):
-                                        padding + layer_size;
+        split_size += padding + output_layer_size;
     }
     layerInfoItem.reserved_size = split_size;
     layerInfoItem.splitInputLayer =
@@ -717,9 +716,9 @@ void GNAPlugin::CopyPrimitive(InferenceEngine::CNNLayerPtr layer) {
     auto &currentComponent = dnnComponentsForLayer.back().second;
     dnn.InitCopyComponent(currentComponent,
                           orientation,
-                          num_rows_in + num_padding_in,
+                          ALIGN(num_rows_in, 8),
                           num_columns_in,
-                          num_rows_out + num_padding_out,
+                          ALIGN(num_rows_out, 8),
                           num_columns_out,
                           inputs->precision.size(),
                           outputs->precision.size(),
@@ -732,7 +731,7 @@ void GNAPlugin::CopyPrimitive(InferenceEngine::CNNLayerPtr layer) {
     size_t num_data_bytes_out = ALIGN(InferenceEngine::details::product(
                                                             begin(outputs->dims), end(outputs->dims)), 8)
                                                                                 * outputs->precision.size();
-    size_t num_data_bytes_in = num_columns_in * (num_rows_in + num_padding_in) * inputs->precision.size();
+    size_t num_data_bytes_in = num_columns_in * ALIGN(num_rows_in, 8) * inputs->precision.size();
 
     connectInput(layer, ptr_inputs, num_data_bytes_in);
     connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out);
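This hunk replaces explicit `num_padding` arithmetic with `ALIGN(num_rows_in, 8)`: GNA requires row counts padded to a multiple of 8. The macro lives in a shared plugin header; a typical round-up-to-multiple definition, assumed here for readability, is:

    // Assumed definitions; the real macros live in a shared plugin header.
    #define ALIGN(number, significance) \
        ((((number) + (significance) - 1) / (significance)) * (significance))
    #define ALIGN64(number) ALIGN(number, 64)

    // Worked examples:
    //   ALIGN(10, 8)  == 16  -> a 10-row copy gets 6 padding rows
    //   ALIGN64(128) == 128  -> a 128-byte crop offset keeps the plain-crop path below
    //   ALIGN64(100) == 128  -> a 100-byte crop offset forces the affine-filter path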
@@ -757,13 +756,23 @@ void GNAPlugin::ConcatPrimitive(InferenceEngine::CNNLayerPtr layer) {
         THROW_GNA_EXCEPTION << "Different precision for Concat input layers are not supported";
     }
 
+    auto& concatLayerInfo = concat_connection.find(concatLayer->name)->second;
     for (auto &&outLayer : concatLayer->outData.front()->getInputTo()) {
         if ( LayerInfo(outLayer.second).isConcat() ) {
-            auto& concatLayerInfo = concat_connection.find(concatLayer->name)->second;
             connectOutput(layer, &concatLayerInfo.gna_ptr,
                           &concatLayerInfo.gna_ptr, concatLayerInfo.reserved_size);
         }
     }
+
+    size_t idx = 0;
+    for (auto && inputLayer : concatLayerInfo.concatInputLayers) {
+        if ( InferenceEngine::details::CaselessEq<std::string>()
+                                            (inputLayer.name, "input") ) {
+            connectInput(layer, &concatLayerInfo.gna_ptr,
+                                concatLayerInfo.reserved_size - inputLayer.offset, static_cast<int32_t>(-inputLayer.offset), idx);
+        }
+        ++idx;
+    }
 }
 
 void GNAPlugin::CropPrimitive(InferenceEngine::CNNLayerPtr layer) {
@@ -780,9 +789,9 @@ void GNAPlugin::CropPrimitive(InferenceEngine::CNNLayerPtr layer) {
 
     auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
     size_t cropOffset = cropLayer->offset.back() * cropLayer->precision.size();
-    size_t cropSize = cropLayer->dim.back() * cropLayer->precision.size();
+    size_t cropOutputSize = cropLayer->dim.back() * cropLayer->precision.size();
 
-    if (ALIGN(cropOffset, 8) == cropOffset) {
+    if (ALIGN64(cropOffset) == cropOffset) {
         // leave crop as it is
         GNAPlugin::GNACropLayer cropLayerInfoItem(layer);
         std::string& id = layer->name;
@@ -795,13 +804,13 @@ void GNAPlugin::CropPrimitive(InferenceEngine::CNNLayerPtr layer) {
         }
 
         // calculate index idx for connectInput last parameter
-        connectInput(layer, &cropLayerInfo->second.gna_ptr, cropSize + cropOffset, cropOffset, 0);
+        connectInput(layer, &cropLayerInfo->second.gna_ptr, cropOutputSize + cropOffset, cropOffset, 0);
 
         // cases for certain output layers
         for (auto &&outLayer : layer->outData.front()->getInputTo()) {
             auto& nextLayer = outLayer.second;
             if ( LayerInfo(nextLayer).isConcat() ) {
-                connectOutput(layer, &cropLayerInfo->second.gna_ptr, &cropLayerInfo->second.gna_ptr, cropSize);
+                connectOutput(layer, &cropLayerInfo->second.gna_ptr, &cropLayerInfo->second.gna_ptr, cropOutputSize);
             }
         }
     } else {
@@ -842,30 +851,16 @@ void GNAPlugin::CropPrimitive(InferenceEngine::CNNLayerPtr layer) {
                                           begin(outputs->dims), end(outputs->dims)) * 4;
 
         size_t num_data_bytes_in = num_columns_in *
-        (num_rows_in + num_padding) * inputs->precision.size();
+                ALIGN(num_rows_in, 8) * inputs->precision.size();
 
         connectInput(layer, ptr_inputs, num_data_bytes_in, 0, 0);
         connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out);
 
-        gnamem->readonly().push_initializer(ptr_weights, num_rows_out * (num_rows_in + num_padding)*layer->precision.size(), [=](void * data, size_t size) {
-            int out = 0;
-            for (int input = cropLayer->offset.back(); input < num_rows_out + cropLayer->offset.back(); ++input) {
-                auto mem_ptr = reinterpret_cast<uint8_t *>(data) + input * layer->precision.size() + out * (num_rows_in+num_padding) * layer->precision.size();
-                if (quantized == nullptr) {
-                    auto float_ptr = reinterpret_cast<float *>(mem_ptr);
-                    *float_ptr = 1.0f;
-                } else {
-                    auto int_ptr = reinterpret_cast<uint16_t *>(mem_ptr);
-                    *int_ptr = 1;
-                }
-                ++out;
-            }
-        }, 64);
-        if (quantized == nullptr) {
-            gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64);
-        } else {
+        FillWeightOfAligningFilter(layer, ptr_weights, cropLayer->offset.back(), quantized != nullptr);
+
+        if (quantized == nullptr) {
+            gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64);
+        } else {
+            gnamem->readonly().push_value<int32_t>(ptr_biases, 0, num_rows_out, 64);
+        }
-        }
     }
 }
 
@@ -907,6 +902,7 @@ void GNAPlugin::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) {
     uint32_t num_rows_in = FROM_IR_DIM(inputs4Bytes, 1);
     uint32_t num_columns_in = FROM_IR_DIM(inputs4Bytes, 2);
     uint32_t num_rows_out = num_rows_in;
+    uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in;
 
     void *ptr_inputs;
     void *ptr_outputs;
@@ -916,9 +912,9 @@ void GNAPlugin::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) {
     dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t());
     auto &currentComponent = dnnComponentsForLayer.back().second;
     dnn.InitAffineComponent(currentComponent,
-                            num_rows_in,
+                            num_rows_in + num_padding,
                             num_columns_in,
-                            num_rows_out,
+                            num_rows_out + num_padding,
                             inputs2Bytes->precision.size(),
                             outputs->precision.size(),
                             // TODO: only fp32 and Int16 tested
@@ -936,11 +932,11 @@ void GNAPlugin::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) {
     cout << "IR layer : " << std::left << std::setw(20) << layer->name << "diagonal_"<< dnnComponentsForLayer.size() - 1 << "\n";
 #endif
 
-    size_t num_data_bytes_out = InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims))
-        * outputs->precision.size();
+    size_t num_data_bytes_out =
+        InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims)) * outputs->precision.size();
 
-    size_t num_data_bytes_in = InferenceEngine::details::product(begin(inputs2Bytes->dims), end(inputs2Bytes->dims))
-        * inputs2Bytes->precision.size();
+    size_t num_data_bytes_in =
+        num_columns_in * (num_rows_in + num_padding) * inputs2Bytes->precision.size();
 
     connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out);
     connectInput(layer, ptr_inputs, num_data_bytes_in, 0, 1 - biasesLayerIdx);
@@ -955,6 +951,7 @@ void GNAPlugin::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) {
                 #define FLOAT_TO_INT16(a) static_cast<int16_t>(((a) < 0)?((a) - 0.5):((a) + 0.5))
 
                 auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast<float>(INT16_MAX)));
+
                 gnamem->readonly().push_value<int16_t>(ptr_weights, quantizedIdentity, num_rows_out, 64);
             }
             connectInput(layer, ptr_biases, num_data_bytes_in, 0, biasesLayerIdx);
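The FLOAT_TO_INT16 macro defined above rounds half away from zero before the cast; a few worked values:

    // FLOAT_TO_INT16( 2.4f) -> static_cast<int16_t>( 2.9f) ==  2
    // FLOAT_TO_INT16( 2.5f) -> static_cast<int16_t>( 3.0f) ==  3
    // FLOAT_TO_INT16(-2.5f) -> static_cast<int16_t>(-3.0f) == -3   (half away from zero)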
@@ -1028,19 +1025,25 @@ void GNAPlugin::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool isDiag)
     auto transpose = false;
     auto transposedRows = 0;
     auto transposedCols = 0;
-    /**
-     * TODO: enable transpose correction between Conv/affine layers implement dedicated pass
-     * TF topologies have inplace permutes so we dont care
-     * kaldi topologies did this internally
-     */
+
     if (0 && connectionInfo.needTransposeWeights) {
-        gnalog() << "Transposing weights for layer: " << layer->name << "\n";
         // direct order is 0, 1, 2, 3, supported order is only 0,3,2,1 where dim 2 is usually equals to 1
         auto permuteOrder = connectionInfo.permute->GetParamAsInts("order");
         if (permuteOrder != vector<int>({0, 3, 2, 1})) {
             THROW_IE_EXCEPTION << "[GNA plugin] Unsupported permute order: was " << layer->GetParamAsString("order") <<
                                ", but only support 0, 3, 2, 1";
         }
+
+        /**
+         * TODO: weights transpose happened after quantisation might result in poor quality for in 8 - move this to passes
+         */
+        if (weightable._weights->precision() == Precision::I8) {
+            THROW_IE_EXCEPTION << "[GNA plugin] Unsupported permute operation for 8 bit weights for layer: " << layer->name;
+        }
+
+        // this affine connected to convolution via pool or activation
+        gnalog() << "Transposing weights for layer: " << layer->name << "\n";
+
         transpose = !isDiag;
         transposedRows = connectionInfo.permute->input()->getDims()[3];
         transposedCols = connectionInfo.permute->input()->getDims()[1];
@@ -1053,7 +1056,6 @@ void GNAPlugin::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool isDiag)
                                         weightable._weights->byteSize(),
                                         64);
         } else {
-            // ToDO: write unit tests for transpose
             gnamem->readonly().push_initializer(ptr_weights, weightable._weights->byteSize(), [=](void * data, size_t size) {
                 for (int k = 0; k < (isDiag ? 1 : num_rows_out); k++) {
                     auto rowOffset = k * transposedRows * transposedCols * weightable.precision.size();
@@ -1063,13 +1065,16 @@ void GNAPlugin::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool isDiag)
                         for (int i = 0; i < transposedRows; i++) {
                             auto offsetWrite = (transposedRows * j + i) * weightable.precision.size();
                             auto offsetRead = (i * transposedCols + j) * weightable.precision.size();
-                            memcpy(u8Data + offsetWrite, cbuffer + offsetRead, weightable.precision.size());
+                            std::memcpy(u8Data + offsetWrite, cbuffer + offsetRead, weightable.precision.size());
                         }
                     }
                 }
             }, 64);
         }
     } else {
+        if (transpose) {
+            THROW_GNA_EXCEPTION << "transpozed weights with non zero padding not yet supported";
+        }
         auto elementsIn = (num_rows_in + num_padding) * num_columns_in;
         auto paddedWeights = isDiag ? elementsIn : elementsIn * num_rows_out;
         auto paddedWeightsSize = paddedWeights * weightable.precision.size();
@@ -1094,6 +1099,123 @@ void GNAPlugin::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool isDiag)
     }
 }
 
+void GNAPlugin::FillWeightOfAligningFilter(InferenceEngine::CNNLayerPtr layer, void* ptrWeights, size_t offset, bool isQuantized) {
+    auto outputs = *layer->outData.begin();
+    auto inputs = layer->insData.begin()->lock();
+
+    uint32_t num_rows_in = FROM_IR_DIM(inputs, 1);
+    uint32_t num_rows_out = FROM_IR_DIM(outputs, 1);
+
+    if (!ptrWeights) {
+        THROW_GNA_EXCEPTION << "Weights memory is not allocated!!!";
+    }
+
+    gnamem->readonly().push_initializer(ptrWeights, num_rows_out * ALIGN(num_rows_in, 8) * layer->precision.size(), [=](void * data, size_t size) {
+        int out = 0;
+        for (int input = offset; input < num_rows_out + offset; ++input) {
+            auto mem_ptr = reinterpret_cast<uint8_t *>(data) + input * layer->precision.size() + out * ALIGN(num_rows_in, 8) * layer->precision.size();
+            if (!isQuantized) {
+                auto float_ptr = reinterpret_cast<float *>(mem_ptr);
+                *float_ptr = 1.0f;
+            } else {
+                auto int_ptr = reinterpret_cast<uint16_t *>(mem_ptr);
+                *int_ptr = 1;
+            }
+            ++out;
+        }
+    }, 64);
+}
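The initializer above writes a shifted identity matrix whose row stride is the 8-aligned input width (the region is assumed zero-initialised by the allocator). With hypothetical sizes num_rows_in = 6 (padded to 8), offset = 2, num_rows_out = 3 it produces:

    // row 0:  0 0 1 0 0 0 0 0
    // row 1:  0 0 0 1 0 0 0 0
    // row 2:  0 0 0 0 1 0 0 0
    //
    // W[out][in] = (in == offset + out) ? 1 : 0, so y = W * x copies
    // x[offset .. offset + num_rows_out) into y: a crop expressed as an affine layer.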
+
+void GNAPlugin::AffineFilterPrimitive(InferenceEngine::CNNLayerPtr layer) {
+    auto filterLayer = dynamic_cast<InferenceEngine::WeightableLayer *> (layer.get());
+
+    if (filterLayer == nullptr) {
+        return;
+    }
+
+    std::string& name = filterLayer->name;
+    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
+
+    // we look for this concat layer pointer in extra concat map
+    auto prevLayer = CNNNetPrevLayer(layer.get(), 0);
+    if (!LayerInfo(prevLayer).isSplit() && !LayerInfo(prevLayer).isSlice()) {
+        THROW_GNA_EXCEPTION << "Case  with Affine Aligning Filter for not Split/Slice layers is not implemented yet!";
+    }
+
+    void *ptr_inputs;
+    void *ptr_outputs;
+    void *ptr_weights;
+    void *ptr_biases;
+
+    auto outputs = *layer->outData.begin();
+    auto inputs = layer->insData.begin()->lock();
+
+    uint32_t num_columns_in = FROM_IR_DIM(inputs, 2);
+    uint32_t num_rows_out = FROM_IR_DIM(outputs, 1);
+    uint32_t num_rows_in = filterLayer->_weights->size() / num_rows_out;
+
+    uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in;
+
+    gnalog() << "Filter " << layer->name << " is being inserted...\n";
+    auto biasPrecision = filterLayer->_biases ? filterLayer->_biases->precision() : outputs->precision;
+    dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t());
+    auto &currentComponent = dnnComponentsForLayer.back().second;
+    dnn.InitAffineComponent(currentComponent,
+                            num_rows_in + num_padding,
+                            num_columns_in,
+                            num_rows_out,
+                            inputs->precision.size(),
+                            outputs->precision.size(),
+                            filterLayer->_weights->precision().size(),
+                            biasPrecision.size(),
+                            quantized == nullptr ? 1 : quantized->_weights_quant.scale,
+                            quantized == nullptr ? 1 : quantized->_dst_quant.scale,
+                            ptr_inputs,
+                            ptr_outputs,
+                            ptr_weights,
+                            ptr_biases,
+                            false);
+
+    size_t num_data_bytes_out =
+                InferenceEngine::details::product(
+                                        begin(outputs->dims), end(outputs->dims)) * 4;
+
+    size_t num_data_bytes_in = num_columns_in *
+                            ALIGN(num_rows_in, 8) * inputs->precision.size();
+
+    connectInput(layer, ptr_inputs, num_data_bytes_in, 0, 0);
+    connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out);
+
+    if (num_padding == 0) {
+        gnamem->readonly().push_ptr(ptr_weights,
+                                filterLayer->_weights->cbuffer().as<const void *>(),
+                                filterLayer->_weights->byteSize(),
+                                                            64);
+    } else {
+        auto elementsIn = (num_rows_in + num_padding) * num_columns_in;
+        auto paddedWeights = elementsIn * num_rows_out;
+        auto paddedWeightsSize = paddedWeights * filterLayer->precision.size();
+
+        gnamem->readonly().push_initializer(ptr_weights, paddedWeightsSize, [=](void * data, size_t size) {
+            for (int i = 0; i < num_rows_out; i++) {
+                std::memcpy(data,
+                       filterLayer->_weights->cbuffer().as<const uint8_t *>() + num_rows_in * i * filterLayer->precision.size(),
+                       num_rows_in * filterLayer->precision.size());
+                data = reinterpret_cast<uint8_t *>(data) + (num_rows_in + num_padding) * filterLayer->precision.size();
+            }
+        }, 64);
+    }
+
+    if (filterLayer->_biases) {
+        gnamem->readonly().push_ptr(ptr_biases,
+                         filterLayer->_biases->cbuffer().as<const void *>(),
+                         filterLayer->_biases->byteSize(),
+                         64);
+    } else {
+        gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64);
+    }
+}
+
 void GNAPlugin::PWLPrimitive(InferenceEngine::CNNLayerPtr layer) {
     auto *generic = dynamic_cast<GenericLayer *>(layer.get());
     std::string type;
@@ -1269,6 +1391,7 @@ void GNAPlugin::CreateLayerPrimitive(CNNLayerPtr layer) {
         {{"Input"}, [](GNAPlugin*, CNNLayerPtr l) {}},  // skip input layers they are not used in GNA lib, only as a memory blobs
         {{"FullyConnected", "InnerProduct"}, CREATE(AffinePrimitive)},
         {{"ScaleShift"}, CREATE(DiagonalPrimitive)},
+        {{"AffineFilter"}, CREATE(AffineFilterPrimitive)},
         {{"Eltwise"},
          CREATE(EltwisePrimitive)},  // same as diagonal while weights are not taken from network, rather than from another output
         {{"Split"}, SKIP},  // skip information about which part of prev layer need to consume handle during layer creation
@@ -1293,109 +1416,10 @@ void GNAPlugin::CreateLayerPrimitive(CNNLayerPtr layer) {
 
 
 GNAPlugin::GNAPlugin(const std::map<std::string, std::string>& configMap) {
-    // holds actual value of a found key
-    std::string value;
-    auto if_set = [&](std::string key, const std::function<void()> & handler) {
-        auto keyInMap = configMap.find(key);
-        if (keyInMap != configMap.end()) {
-            value = keyInMap->second;
-            handler();
-        }
-    };
-
-    if_set(GNA_CONFIG_KEY(SCALE_FACTOR), [&] {
-        input_scale_factor = std::stod(value);
-    });
-
-    if_set(GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE), [&] {
-        dumpXNNPath = value;
-    });
-
-    if_set(GNA_CONFIG_KEY(DEVICE_MODE), [&] {
-        static caseless_unordered_map <std::string, uint32_t> supported_values = {
-            {GNAConfigParams::GNA_AUTO, GNA_AUTO},
-            {GNAConfigParams::GNA_HW, GNA_HARDWARE},
-            {GNAConfigParams::GNA_SW, GNA_SOFTWARE},
-            {GNAConfigParams::GNA_SW_EXACT, GNA_SOFTWARE & GNA_HARDWARE}
-        };
-        auto procType = supported_values.find(value);
-        if (procType == supported_values.end()) {
-            THROW_GNA_EXCEPTION << "GNA device mode unsupported: " << value;
-        }
-        gna_proc_type = static_cast<intel_gna_proc_t>(procType->second);
-    });
-
-    if_set(GNA_CONFIG_KEY(COMPACT_MODE), [&] {
-        if (value == PluginConfigParams::YES) {
-            compact_mode = true;
-        } else if (value == PluginConfigParams::NO) {
-            compact_mode = false;
-        } else {
-            THROW_GNA_EXCEPTION << "GNA compact mode should be YES/NO, but not" << value;
-        }
-    });
-
-    if_set(CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS), [&] {
-        if (value == PluginConfigParams::YES) {
-            exclusive_async_requests  = true;
-        } else if (value == PluginConfigParams::NO) {
-            exclusive_async_requests  = false;
-        } else {
-            THROW_GNA_EXCEPTION << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not" << value;
-        }
-    });
-
-    if_set(GNA_CONFIG_KEY(PRECISION), [&] {
-        auto precision = Precision::FromStr(value);
-        if (precision != Precision::I8 && precision != Precision::I16) {
-            THROW_GNA_EXCEPTION << "Unsupported precision of GNA hardware, should be Int16 or Int8, but was: " << value;
-        }
-        gnaPrecision = precision;
-    });
-
-    if_set(GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN), [&] {
-        if (value == PluginConfigParams::YES) {
-            uniformPwlDesign = true;
-        } else if (value == PluginConfigParams::NO) {
-            uniformPwlDesign = false;
-        } else {
-            THROW_GNA_EXCEPTION << "GNA pwl uniform algorithm parameter "
-                                                            << "should be equal to YES/NO, but not" << value;
-        }
-    });
-
-    if_set(CONFIG_KEY(PERF_COUNT), [&] {
-        if (value == PluginConfigParams::YES) {
-            performance_counting = true;
-        } else if (value == PluginConfigParams::NO) {
-            performance_counting = false;
-        } else {
-            THROW_GNA_EXCEPTION << "GNA performance counter enabling parameter "
-                                                            << "should be equal to YES/NO, but not" << value;
-        }
-    });
-
-    if_set(GNA_CONFIG_KEY(LIB_N_THREADS), [&] {
-        uint64_t lib_threads = std::stoul(value, NULL, 10);
-        if (lib_threads == 0 || lib_threads > std::numeric_limits<uint8_t>::max()/2-1) {
-            THROW_GNA_EXCEPTION << "Unsupported accelerator lib number of threads: " << value
-                                                            << ", should be greateer than 0 and less than 127";
-        }
-        gna_lib_async_threads_num = lib_threads;
-    });
-
-    if_set(CONFIG_KEY(SINGLE_THREAD), [&] {
-        if (value == PluginConfigParams::YES) {
-            gna_openmp_multithreading  = false;
-        } else if (value == PluginConfigParams::NO) {
-            gna_openmp_multithreading  = true;
-        } else {
-            THROW_GNA_EXCEPTION << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not" << value;
-        }
-    });
+    SetConfig(configMap);
 }
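The constructor body moved into SetConfig(), so a single key/value map now drives both construction-time and later configuration. A minimal usage sketch, assuming the literal strings the GNA_CONFIG_KEY macros expand to in this release:

    // Sketch; the key literals are assumptions based on the GNA_CONFIG_KEY(...) names above.
    std::map<std::string, std::string> config = {
        {"GNA_SCALE_FACTOR",  "2048"},      // GNA_CONFIG_KEY(SCALE_FACTOR)
        {"GNA_DEVICE_MODE",   "GNA_AUTO"},  // GNA_CONFIG_KEY(DEVICE_MODE)
        {"GNA_COMPACT_MODE",  "NO"},        // GNA_CONFIG_KEY(COMPACT_MODE)
        {"GNA_LIB_N_THREADS", "1"},         // GNA_CONFIG_KEY(LIB_N_THREADS)
    };
    GNAPlugin plugin(config);  // equivalent to default-constructing and calling SetConfig(config)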
 
-GNAPluginNS::GNAPlugin::LayerType GNAPlugin::LayerTypeFromStr(const std::string &str) {
+GNAPluginNS::GNAPlugin::LayerType GNAPlugin::LayerTypeFromStr(const std::string &str) const {
     static const caseless_map<std::string, GNAPlugin::LayerType> LayerNameToType = {
         { "Input" , Input },
         { "Convolution" , Convolution },
@@ -1433,13 +1457,14 @@ bool GNAPlugin::AreLayersSupported(ICNNNetwork& network, std::string& errMessage
     auto network_precision = network.getPrecision();
     network.getInputsInfo(inputs);
     auto network_input_precision = inputs.begin()->second->getInputPrecision();
-    auto batch_sise = network.getBatchSize();
+    auto batch_size = network.getBatchSize();
     if (network_precision != Precision::FP32) {
         errMessage = "The plugin does not support networks with " + std::string(network_precision.name()) + " format.\n";
         return false;
     }
     if (network_input_precision != Precision::FP32 &&
-        network_input_precision != Precision::I16) {
+        network_input_precision != Precision::I16 &&
+        network_input_precision != Precision::U8) {
         errMessage = "The plugin does not support input precision with " + std::string(network_input_precision.name()) + " format.\n";
         return false;
     }
@@ -1469,7 +1494,9 @@ bool GNAPlugin::AreLayersSupported(ICNNNetwork& network, std::string& errMessage
                                                     errMessage = "Layer is unsupported by GNA: " + layer->name + ":" + layer->type + "\n";
                                                     check_result =  false;
                                                 }
-                                                if (batch_sise != 1 && LayerInfo::isBatchSizeConstrained(layer->type)) {
+                                                if (batch_size != 1 && LayerInfo::isBatchSizeConstrained(layer->type)) {
+                                                    errMessage = "topology with layer: " + layer->name + ", type: " + layer->type +
+                                                                 ", and batch size (" + to_string(batch_size) + ") != 1 is not supported";
                                                     check_result =  false;
                                                 }
                                             }, false);
@@ -1477,6 +1504,10 @@ bool GNAPlugin::AreLayersSupported(ICNNNetwork& network, std::string& errMessage
     return check_result;
 }
 
+float GNAPlugin::get_input_scale_factor() const {
+    return input_scale_factor.empty() ? 1.0f : input_scale_factor.begin()->second;
+}
+
 void GNAPlugin::LoadNetwork(ICNNNetwork &network) {
     //  Check the input network
     std::string error;
@@ -1490,21 +1521,34 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) {
         substitutePRelu(layers);
         layers = CNNNetSortTopologically(*network.get());
         reorderMaxPool(layers);
-        applyOrientations(layers);
+        //  TODO: re-sort only if the insertion function
+        //  reports changes via a bool "changed" return flag
+        insertAligningFilterLayer(layers);
+
+#if ENABLE_AUTO_PERMUTE
+        layers = CNNNetSortTopologically(*network.get());
+        reversePermutations(layers);
+#endif
+        layers = CNNNetSortTopologically(*network.get());
         insertIdentityLayer(layers);
+        layers = CNNNetSortTopologically(*network.get());
+        insertCopyLayer(layers);
+        layers = CNNNetSortTopologically(*network.get());
         insertDiagonalLayer(layers);
+        layers = CNNNetSortTopologically(*network.get());
+        substituteScaleShiftBroadCast(layers);
     };
 
     Config supported = Config({
         {TargetDevice::eGNA, Precision::FP32, [&](InferenceEngine::ICNNNetwork &network) -> CNNNetworkPtr {
             if (gnaPrecision == Precision::I16) {
                 ModelQuantizer<QuantI16> q;
-                return q.quantize(network, run_passes, input_scale_factor);
+                return q.quantize(network, run_passes, get_input_scale_factor());
             }
 
             if (gnaPrecision == Precision::I8) {
                 ModelQuantizer<QuantI8> q;
-                return q.quantize(network, run_passes, input_scale_factor);
+                return q.quantize(network, run_passes, get_input_scale_factor());
             }
             THROW_GNA_EXCEPTION << "no mans land for GNA precision";
         }},
@@ -1529,24 +1573,13 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) {
 
     supported.setDefaultDevice(TargetDevice::eGNA);
     auto newNet = supported.find_configuration(network).convert(network);
-    auto networkPrecision = newNet->getPrecision();
 
-    if (!networkPrecision.is_float()) {
-        gnadevice.reset(new GNADeviceHelper(gna_proc_type,
-                                            gna_lib_async_threads_num,
-                                            gna_openmp_multithreading,
-                                            performance_counting));
-        gnamem.reset(new gna_memory_type(
-                    make_polymorph<GNAAllocator>(*gnadevice.get()), PAGE_SIZE_BYTES));
-    } else {
-        gnamem.reset(new gna_memory_type(make_polymorph<std::allocator<uint8_t>>()));
-    }
+
 
     // creating intel dnn_t structures from network
     auto sortedNet = CNNNetSortTopologically(*newNet);
     std::vector<CNNLayerPtr> sortedNoMem;
-    std::map<std::string,
-                    std::vector<InferenceEngine::CNNLayerPtr>> memoryPairs;
+    std::unordered_map<std::string, std::vector<InferenceEngine::CNNLayerPtr>> memoryPairs;
     // find all memory layers pairs and mark which one used as outputs
     for (auto &layer : sortedNet) {
         auto generic = dynamic_cast<GenericLayer *>(layer.get());
@@ -1572,16 +1605,28 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) {
     // fill in extra storage with memory layers
     fillMemoryConnections(memoryPairs);
 
+    if (memory_connection.size() != 0) {
+        gna_lib_async_threads_num = 1;
+    }
+
+    auto networkPrecision = newNet->getPrecision();
+
+    if (!networkPrecision.is_float()) {
+        gnadevice.reset(new GNADeviceHelper(gna_proc_type,
+                                            gna_lib_async_threads_num,
+                                            gna_openmp_multithreading,
+                                            performance_counting));
+        gnamem.reset(new gna_memory_type(
+                make_polymorph<GNAAllocator>(*gnadevice.get()), PAGE_SIZE_BYTES));
+    } else {
+        gnamem.reset(new gna_memory_type(make_polymorph<std::allocator<uint8_t>>()));
+    }
+
     // keep inputs information and create input primitives
     newNet->getInputsInfo(inputsDataMap);
     if (inputsDataMap.empty()) {
         THROW_GNA_EXCEPTION << " No inputs for the topology";
     }
-    if (inputsDataMap.size() != 1) {
-        THROW_GNA_EXCEPTION << " cannot infer topologies with more than one inputs";
-    }
-
-    inputDims = inputsDataMap.begin()->second->getDims();
 
     // keep output dims
     newNet->getOutputsInfo(outputsDataMap);
@@ -1593,7 +1638,10 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) {
     }
     outputDims = outputsDataMap.begin()->second->dims;
 
-    ptr_inputs_global.resize(gna_lib_async_threads_num);
+    for (auto && input : inputsDataMap) {
+        get_ptr_inputs_global(input.first).resize(gna_lib_async_threads_num);
+    }
+
     ptr_outputs_global.resize(gna_lib_async_threads_num);
     // CreatingLayer primitives
     // TODO: solely gna_example convolution hack
@@ -1601,11 +1649,25 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) {
     for (auto layer = sortedNoMem.begin(); layer != sortedNoMem.end(); ++layer) {
         CreateLayerPrimitive(*layer);
     }
-    gnamem->bind_ptr(&ptr_outputs_global.front(), &dnnComponentsForLayer.back().second.ptr_outputs);
+    DnnComponentsForLayer::iterator output_component = std::find_if(dnnComponentsForLayer.begin(),
+                                                        dnnComponentsForLayer.end(),
+                                                        [&](const std::pair<std::string, intel_dnn_component_t>& v)
+                                                        { return outputsDataMap.begin()->first == v.first; });
+
+    if (output_component == dnnComponentsForLayer.end()) {
+        if (dnnComponentsForLayer.empty()) {
+            THROW_GNA_EXCEPTION << "No outputs found in internal structures";
+        }
+        // likely layer is fused. Take last one
+        output_component = std::prev(dnnComponentsForLayer.end());
+        gnalog() << "Output layer "<< outputsDataMap.begin()->first
+                    << " has not been found in component list. Took  "
+                    << output_component->first << " instead \n" << std::flush;
+    }
+    gnamem->bind_ptr(&ptr_outputs_global.front(), &output_component->second.ptr_outputs);
 
     // make room for active list
-    auto &last_component = dnnComponentsForLayer.back().second;
-    gnamem->reserve_ptr(nullptr, ALIGN64(last_component.num_bytes_per_output * last_component.num_rows_out));
+    gnamem->reserve_ptr(nullptr, ALIGN64(output_component->second.num_bytes_per_output * output_component->second.num_rows_out));
 
     void *pParallelExecutionData  = nullptr;
 
@@ -1630,16 +1692,16 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) {
     // in fp32 mode last PWL cannot be computed without that
     dnn.InitActiveList(NULL);
 
-    nnets.push_back(std::make_tuple(make_shared<CPPWrapper<intel_nnet_type_t>>(0), -1, InferenceEngine::BlobMap()));
+    nnets.push_back(std::make_tuple(make_shared<CPPWrapper<intel_nnet_type_t>>(), -1, InferenceEngine::BlobMap()));
 
     if (!networkPrecision.is_float()) {
         // number of layer gets calculated inside that InitGNAStruct function
         dnn.InitGNAStruct(&std::get<0>(nnets.front())->obj);
     }
 
-    // creating same gna RW segment for paralle infer requests
+    // creating same gna RW segment for parallel infer requests
     for (int i = 1; i != gna_lib_async_threads_num; i++) {
-        nnets.push_back(std::make_tuple(make_shared<CPPWrapper<intel_nnet_type_t>>(0), -1, InferenceEngine::BlobMap()));
+        nnets.push_back(std::make_tuple(make_shared<CPPWrapper<intel_nnet_type_t>>(), -1, InferenceEngine::BlobMap()));
 
         // this can be improved by just copy all structures, but we are too lazy
         dnn.InitGNAStruct(&std::get<0>(nnets.back())->obj);
@@ -1656,7 +1718,10 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) {
             }
         };
 
-        relocate(ptr_inputs_global[i], ptr_inputs_global[0]);
+        for (auto &&input : ptr_inputs_global_storage) {
+            relocate(input[i], input[0]);
+        }
+
         relocate(ptr_outputs_global[i], ptr_outputs_global[0]);
         for (int j = 0; j != std::get<0>(nnets.front())->obj.nLayers; j++) {
             auto & layer = std::get<0>(nnets[i])->obj.pLayers[j];
@@ -1666,11 +1731,60 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) {
             relocate(layer.pOutputsIntermediate, layer.pOutputsIntermediate);
         }
     }
-    orientation_in = dnn.component[0].orientation_in;
-    orientation_out = dnn.component[dnn.num_components()-1].orientation_out;
-    num_bytes_per_output = dnn.component[dnn.num_components()-1].num_bytes_per_output;
 
-    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(sortedNoMem.back());
+    // calculate input orientation ignoring memory layers, since their orientation does not change during inference right now
+    std::unordered_map<string, string> skippedLayers;
+    for (auto &layer : sortedNet) {
+        for (int i = 0; CNNNetHasPrevLayer(layer.get(), i); i++) {
+            auto prevLayer = CNNNetPrevLayer(layer.get(), i);
+            if (!skippedLayers.count(prevLayer->name)) {
+                if (CNNNetHasPrevLayer(prevLayer.get())) {
+                    continue;
+                }
+
+                // we are in one of the input layers
+                if (LayerInfo(prevLayer).isMemory()) {
+                    continue;
+                }
+            }
+
+            auto dnnLayer = findDnnLayer(layer);
+            string inputName = prevLayer->name;
+            if (skippedLayers.count(prevLayer->name)) {
+                inputName = skippedLayers[prevLayer->name];
+            }
+
+            // non-functional layer - skipped by GNA
+            if (nullptr == dnnLayer) {
+                // storing input name for skipped layer
+                skippedLayers[layer->name] = inputName;
+                continue;
+            }
+
+            // input orientation might be already initialized, thus verify that it matches
+            if (!orientation_in.count(inputName)) {
+                orientation_in[inputName] = dnnLayer->orientation_in;
+            } else {
+                if (orientation_in[inputName] != dnnLayer->orientation_in) {
+                    THROW_GNA_EXCEPTION << "orientation for input layer: " << inputName << "cannot be calculated";
+                }
+            }
+        }
+    }
+
+    orientation_out = output_component->second.orientation_out;
+    num_bytes_per_output = output_component->second.num_bytes_per_output;
+
+    // find output layer
+    auto output = std::find_if(sortedNet.begin(),
+                                sortedNet.end(),
+                                [&](const CNNLayerPtr& v)
+                                { return outputsDataMap.begin()->first == v.get()->name; });
+    if (output == sortedNet.end()) {
+        // likely layer is fused. Take last one
+        output = std::prev(sortedNet.end());
+    }
+    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(*output);
     output_scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f;
 
     num_rotate_rows = dnn.num_rotate_rows;
@@ -1692,7 +1806,7 @@ void GNAPlugin::DumpXNNToFile() const {
         }
         auto dump = gnadevice->dumpXnn(&std::get<0>(nnets.front())->obj, ptr_active_indices, num_active_indices);
         dump.header.rw_region_size = gnamem->getRWBytes();
-        dump.header.input_scaling_factor = input_scale_factor;
+        dump.header.input_scaling_factor = get_input_scale_factor();
         dump.header.output_scaling_factor = output_scale_factor;
         std::ofstream dumpStream(dumpXNNPath, std::ios::out | std::ios::binary);
         dumpStream.write(reinterpret_cast<char*>(&dump.header), sizeof(intel_gna_model_header));
@@ -1726,69 +1840,81 @@ void RotateFeatures(uint8_t *ptr_feat,
     }
 }
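Only the tail of RotateFeatures is visible in this hunk. It is assumed to transpose each num_rotate_rows x num_rotate_columns block of every feature vector, converting between interleaved and deinterleaved element order; a sketch under that assumption (not the plugin's actual body):

    #include <cstdint>
    #include <cstring>
    #include <vector>

    void RotateFeaturesSketch(uint8_t *ptr_feat, int element_size,
                              uint32_t num_feature_vectors,
                              uint32_t num_feature_vector_elements,
                              uint32_t num_rotate_rows,
                              uint32_t num_rotate_columns) {
        // assumes num_rotate_rows * num_rotate_columns == num_feature_vector_elements
        std::vector<uint8_t> temp(num_feature_vector_elements * element_size);
        for (uint32_t k = 0; k < num_feature_vectors; k++) {
            uint8_t *ptr_in = ptr_feat + k * num_feature_vector_elements * element_size;
            for (uint32_t i = 0; i < num_rotate_rows; i++) {
                for (uint32_t j = 0; j < num_rotate_columns; j++) {
                    // element (i, j) of the row-major block lands at (j, i)
                    std::memcpy(&temp[(j * num_rotate_rows + i) * element_size],
                                ptr_in + (i * num_rotate_columns + j) * element_size,
                                element_size);
                }
            }
            std::memcpy(ptr_in, temp.data(), temp.size());
        }
    }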
 
-uint32_t GNAPlugin::QueueInference(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result) {
-    return QueueInference(*input.begin()->second.get(), result);
-
-    /*if (!syncPoints.empty()) {
-        syncPoints.back().second = result;
-    }*/
-}
-
-uint32_t GNAPlugin::QueueInference(const InferenceEngine::Blob &input, InferenceEngine::BlobMap &result) {
-    auto inputLayout = input.layout();
-    if (inputLayout != Layout::NC && inputLayout != Layout::CN && inputLayout != NCHW) {
-        THROW_GNA_EXCEPTION << "Expected input blob to have Layout::NC or Layout::CN, but was: " << input.layout();
-    }
-    if (inputLayout == NCHW) {
-        inputLayout = NC;
-    }
-    auto is2D = input.layout() ==  Layout::NC || input.layout() == Layout ::CN;
-
+uint32_t GNAPlugin::QueueInference(const InferenceEngine::BlobMap &inputs, InferenceEngine::BlobMap &result) {
     auto freeNnet = std::find_if(std::begin(nnets), std::end(nnets), [](decltype(nnets.front()) & item) {
         return std::get<1>(item) == -1;
     });
 
     if (freeNnet == nnets.end()) {
-        THROW_IE_EXCEPTION << as_status << REQUEST_BUSY
-                           << "GNA executable network has max of " << static_cast<uint32_t >(gna_lib_async_threads_num)
-                           << " parallel infer requests, please sync one of already running";
+        if (memory_connection.size() != 0) {
+            Wait(0);
+            freeNnet = nnets.begin();
+        } else {
+            THROW_IE_EXCEPTION << as_status << REQUEST_BUSY
+                               << "GNA executable network has max of "
+                               << static_cast<uint32_t >(gna_lib_async_threads_num)
+                               << " parallel infer requests, please sync one of already running";
+        }
     }
 
+
     auto nnet = std::get<0>(*freeNnet).get();
     auto idx = static_cast<uint32_t>(std::distance(std::begin(nnets), freeNnet));
 
-    if (ptr_inputs_global[idx] == nullptr) {
-        // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance
-        THROW_GNA_EXCEPTION << "network not loaded : global input pointer not set";
-    }
+    for (auto &input : inputs) {
+        auto inputLayout = input.second->layout();
+        if (inputLayout != Layout::NC && inputLayout != Layout::CN && inputLayout != NCHW) {
+            THROW_GNA_EXCEPTION << "Expected input blob to have Layout::NC or Layout::CN, but was: "
+                                << input.second->layout();
+        }
+        if (inputLayout == NCHW) {
+            inputLayout = NC;
+        }
+        auto is2D = input.second->layout() == Layout::NC || input.second->layout() == Layout::CN;
 
-    if (orientation_in == kDnnUnknownOrientation) {
-        // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance
-        THROW_GNA_EXCEPTION << "network not loaded : input orientation not set";
-    }
+        if (!ptr_inputs_global_id.count(input.first)) {
+            // should not happen in user code; might happen only in integrations of GNAPlugin that bypass executable networks
+            THROW_GNA_EXCEPTION << "network not loaded : input pointer for " << input.first << " not set";
+        }
 
-    if (orientation_out == kDnnUnknownOrientation) {
-        // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance
-        THROW_GNA_EXCEPTION << "network not loaded : output orientation not set";
-    }
+        if (get_ptr_inputs_global(input.first)[idx] == nullptr) {
+            // should not happen in user code; might happen only in integrations of GNAPlugin that bypass executable networks
+            THROW_GNA_EXCEPTION << "network not loaded : input pointer for (" << input.first << " at inferRequest #"
+                                << idx << " not set";
+        }
 
-    ImportFrames(ptr_inputs_global[idx],
-                 input.cbuffer().as<float *>(),
-                 input.precision(),
-                 orientation_in,
-                 input.dims()[input.dims().size() - 1],
-                 is2D ? input.dims()[1] : input.dims()[input.dims().size() - 1],
-                 is2D ? input.dims()[0] :input.dims()[0]*input.dims()[2],
-                 is2D ? input.dims()[0] :input.dims()[0]*input.dims()[2]);
+        if (orientation_in[input.first] == kDnnUnknownOrientation) {
+            // should not happen in user code; might happen only in integrations of GNAPlugin that bypass executable networks
+            THROW_GNA_EXCEPTION << "network not loaded : input orientation for " << input.first << " not set";
+        }
+
+        if (orientation_out == kDnnUnknownOrientation) {
+            // should not happen in user code; might happen only in integrations of GNAPlugin that bypass executable networks
+            THROW_GNA_EXCEPTION << "network not loaded : output orientation not set";
+        }
 
-    if ((inputLayout == Layout::NC || inputLayout == Layout::NCHW) != (orientation_in == kDnnInterleavedOrientation)) {
-        RotateFeatures(reinterpret_cast<uint8_t*>(ptr_inputs_global[idx]),
-                       gnadevice ? 2 : 4,
-                       // TODO: only works for cnn4a and google command so far
-                       input.dims()[input.dims().size() - 1],
-                       is2D ? input.dims()[0] :input.dims()[0]*input.dims()[2],  // num_feature_vectors looks batch should be there
-                       num_rotate_rows,
-                       num_rotate_columns);
+        auto dims = input.second->dims();
+
+        ImportFrames(get_ptr_inputs_global(input.first)[idx],
+                     input.second->cbuffer().as<float *>(),
+                     input.second->precision(),
+                     orientation_in[input.first],
+                     dims[dims.size() - 1],
+                     is2D ? dims[1] : dims[dims.size() - 1],
+                     is2D ? dims[0] : dims[0] * dims[1] * dims[2],
+                     is2D ? dims[0] : dims[0] * dims[1] * dims[2]);
+        bool isOneChannel = input.second->getTensorDesc().getDims()[1] == 1;
+        if (((inputLayout == Layout::NC || inputLayout == Layout::NCHW)
+            != (orientation_in[input.first] == kDnnInterleavedOrientation))
+            && !isOneChannel) {
+            RotateFeatures(reinterpret_cast<uint8_t *>(get_ptr_inputs_global(input.first)[idx]),
+                           gnadevice ? 2 : 4,
+                           // TODO: only works for cnn4a and google command so far
+                           dims[dims.size() - 1],
+                           is2D ? dims[0] : dims[0] * dims[2],  // num_feature_vectors looks batch should be there
+                           num_rotate_rows,
+                           num_rotate_columns);
+        }
     }
 
     if (!gnadevice) {
@@ -1810,7 +1936,7 @@ void GNAPlugin::Wait(uint32_t idx) {
     }
 
     std::get<1>(nnets[idx]) = -1;
-    auto & output = *std::get<2>(nnets[idx]).begin()->second;
+    auto & result = std::get<2>(nnets[idx]);
 #ifdef PLOT
     dnn.BeginNewWrite();
     if (dnn.num_components() != 0) {
@@ -1819,18 +1945,38 @@ void GNAPlugin::Wait(uint32_t idx) {
     }
     dnn.WriteInputAndOutputTextGNA(&std::get<0>(nnets.front())->obj);
 #endif
+    if (result.size() != 1) {
+        THROW_GNA_EXCEPTION << "Invalid number of outputs for infer request: " << result.size() << ",  only 1 supported";
+    }
+    auto & output = *result.begin()->second;
 
     if (output.layout() == Layout::NC) {
         // TODO: rotate can be incorporated with exporting - used only in unit tests so far
         // TODO: restore:
 //        if (orientation_out != kDnnInterleavedOrientation) {
+//            if (inputs.size() != 1) {
+//                THROW_GNA_EXCEPTION << "Invalid number of inputs for deinterleave " << inputs.size()
+//                                    << ", only 1 supported";
+//            }
+//            auto dims = inputs.begin()->second->dims();
 //            RotateFeatures(reinterpret_cast<uint8_t*>(ptr_outputs_global),
 //                           gnadevice ? 2 : 4,
-//                           input.dims()[input.dims().size() - 1],
-//                           input.dims()[0],  // num_feature_vectors looks batch should be there
-//                           input.dims()[0],
-//                           input.dims()[input.dims().size() - 1]);
+//                           dims[dims.size() - 1],
+//                           dims[0],  // num_feature_vectors looks batch should be there
+//                           dims[0],
+//                           dims[dims.size() - 1]);
 //        }
+        // we consider the last layer to be the output ...
+        size_t output_layer_index = std::max(0, static_cast<int>(std::get<0>(nnets[idx])->obj.nLayers - 1));
+        if (gnadevice && std::get<0>(nnets[idx])->obj.pLayers[output_layer_index].pOutputs != ptr_outputs_global[idx]) {
+            // ...as this is not true, we should look for output layer index
+            for (int j = 0; j != std::get<0>(nnets[idx])->obj.nLayers; j++) {
+                if (std::get<0>(nnets[idx])->obj.pLayers[j].pOutputs == ptr_outputs_global[idx]) {
+                    output_layer_index = j;
+                    break;
+                }
+            }
+        }
 
         ExportScores(output.buffer(),
                      ptr_outputs_global[idx],
@@ -1841,7 +1987,7 @@ void GNAPlugin::Wait(uint32_t idx) {
                      output.dims()[0],
                      output.dims()[0],
                      // TODO: create better getter consider multiple outputs case
-                     gnadevice ? std::get<0>(nnets[idx])->obj.pLayers[std::get<0>(nnets[idx])->obj.nLayers - 1].nBytesPerOutput : sizeof(float),
+                     gnadevice ? std::get<0>(nnets[idx])->obj.pLayers[output_layer_index].nBytesPerOutput : sizeof(float),
                      sizeof(float));
     } else if (output.layout() != Layout::CN) {
         THROW_GNA_EXCEPTION << "Expected output blob to have Layout::NC or Layout::CN. But was " << output.layout();
@@ -1884,13 +2030,6 @@ void GNAPlugin::Wait(uint32_t idx) {
     }
 }
 
-
-void GNAPlugin::Infer(const InferenceEngine::Blob &input, InferenceEngine::Blob &output) {
-    BlobMap result;
-    result["output"] = std::shared_ptr<Blob>(&output, [](Blob*){});
-    Wait(QueueInference(input, result));
-}
-
 void GNAPlugin::Reset() {
     for (auto && memLayer : memory_connection) {
         std::memset(memLayer.second.gna_ptr, 0, memLayer.second.reserved_size);
@@ -1900,10 +2039,23 @@ void GNAPlugin::Reset() {
     }
 }
 
-void GNAPlugin::Infer(const BlobMap &inputs, BlobMap &result) {
-    auto &input = *inputs.begin()->second.get();
-    auto &output = *result.begin()->second.get();
-    Infer(input, output);
+void GNAPlugin::Infer(const InferenceEngine::Blob &input, InferenceEngine::Blob &output) {
+    BlobMap bmInput;
+    BlobMap bmOutput;
+    if (inputsDataMap.size() != 1) {
+        THROW_GNA_EXCEPTION << "cannot infer using Infer(Blob&, Blob&)"<< "model accepts " << inputsDataMap.size() << "inputs";
+    }
+    if (outputsDataMap.size() != 1) {
+        THROW_GNA_EXCEPTION << "cannot infer using Infer(Blob&, Blob&)"<< "model accepts " << outputsDataMap.size() << "outputs";
+    }
+
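+    // wrap the caller's blobs in non-owning shared_ptrs (no-op deleters) keyed by the
+    // network's single input and output names, then defer to the BlobMap overload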
+    bmInput[inputsDataMap.begin()->first] = std::shared_ptr<Blob>(const_cast<Blob*>(&input), [](Blob*){});
+    bmOutput[outputsDataMap.begin()->first] = std::shared_ptr<Blob>(&output, [](Blob*){});
+    Infer(bmInput, bmOutput);
+}
+
+void GNAPlugin::Infer(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result) {
+    Wait(QueueInference(input, result));
 }
 
 Blob::Ptr GNAPlugin::GetOutputBlob(InferenceEngine::Precision precision) {
@@ -1914,10 +2066,11 @@ Blob::Ptr GNAPlugin::GetOutputBlob(InferenceEngine::Precision precision) {
     return outputBlob;
 }
 
-Blob::Ptr GNAPlugin::GetInputBlob(InferenceEngine::Precision precision) {
+Blob::Ptr GNAPlugin::GetInputBlob(std::string name, InferenceEngine::Precision precision) {
     InferenceEngine::Blob::Ptr inputBlob;
     // need to have intermediate blob for interleave conversion
     // TODO: NCHW format support is experimental = c++ MO did insert reshape, while TF mo - not
+    auto inputDims = inputsDataMap[name]->getDims();
     inputBlob = make_blob_with_precision(precision, inputDims.size() == 2 ? NC : NCHW, inputDims);
     inputBlob->allocate();
     return inputBlob;
@@ -1955,7 +2108,8 @@ InferenceEngine::IExecutableNetwork::Ptr GNAPlugin::ImportNetwork(const std::str
     auto serial = GNAModelSerial(&std::get<0>(nnets.back())->obj, mt);
     serial.Import(basePtr, header.gnaMemSize, inputStream);
 
-    ptr_inputs_global.push_back(reinterpret_cast<float*>(reinterpret_cast<uint8_t *> (basePtr) + header.input.descriptor_offset));
+    get_ptr_inputs_global("input").push_back(reinterpret_cast<float*>(reinterpret_cast<uint8_t *> (basePtr) + header.input.descriptor_offset));
     ptr_outputs_global.push_back(reinterpret_cast<float*>(reinterpret_cast<uint8_t *> (basePtr) + header.output.descriptor_offset));
 
     auto getOrientation = [](intel_nnet_layer_t & layer) {
@@ -1963,14 +2117,14 @@ InferenceEngine::IExecutableNetwork::Ptr GNAPlugin::ImportNetwork(const std::str
            kDnnNonInterleavedOrientation : kDnnInterleavedOrientation;
     };
 
-    orientation_in = getOrientation(std::get<0>(nnets.back())->obj.pLayers[0]);
+    orientation_in["input"] = getOrientation(std::get<0>(nnets.back())->obj.pLayers[0]);
     orientation_out = getOrientation(std::get<0>(nnets.back())->obj.pLayers[std::get<0>(nnets.back())->obj.nLayers-1]);
 
     num_bytes_per_output = header.output.element_size;
 
 
     outputDims = SizeVector({header.output.elements_count / header.nGroup, header.nGroup});
-    inputDims = SizeVector({header.input.elements_count / header.nGroup, header.nGroup});
+    auto inputDims = SizeVector({header.input.elements_count / header.nGroup, header.nGroup});
 
     inputsDataMap["input"] = std::make_shared<InputInfo>();
     inputsDataMap["input"]->setInputData(make_shared<Data>("input",
@@ -1983,7 +2137,7 @@ InferenceEngine::IExecutableNetwork::Ptr GNAPlugin::ImportNetwork(const std::str
                                                  Layout::NC);
 
     output_scale_factor = header.output.scaleFactor;
-    input_scale_factor = header.input.scaleFactor;
+    input_scale_factor["input"] = header.input.scaleFactor;
 
     num_rotate_rows = header.nRotateRows;
     num_rotate_columns = header.nRotateColumns;
@@ -2007,20 +2161,25 @@ InferenceEngine::IExecutableNetwork::Ptr GNAPlugin::ImportNetwork(const std::str
 }
 
 void GNAPlugin::Export(const std::string &fileName) {
-    if (ptr_inputs_global.empty() || ptr_outputs_global.empty()) {
+    if (ptr_inputs_global_id.empty() || ptr_outputs_global.empty()) {
         THROW_GNA_EXCEPTION << " network not loaded";
     }
 
+    if (ptr_inputs_global_id.size() != 1) {
+        THROW_GNA_EXCEPTION << " exporting network with multiple inputs not supported";
+    }
+
     std::fstream outStream(fileName, ios_base::out | ios_base::binary);
 
     // TODO: nnet group parameter looks only used in application - so can we move this line into load network.
+    auto inputDims = inputsDataMap.begin()->second->getDims();
     if (inputDims.size() == 2) {
         std::get<0>(nnets.front())->obj.nGroup = inputDims[1];
     }
 
     auto serial = GNAModelSerial(&std::get<0>(nnets.front())->obj,
-                   {input_scale_factor,
-                    ptr_inputs_global[0],
+                   {get_input_scale_factor(),
+                    ptr_inputs_global_storage.front()[0],
                     2,
                     static_cast<uint32_t>(InferenceEngine::details::product(inputsDataMap.begin()->second->getDims()))},
                    {output_scale_factor,
@@ -2043,7 +2202,209 @@ void GNAPlugin::GetPerformanceCounts(std::map<std::string, InferenceEngine::Infe
 }
 
 void GNAPlugin::AddExtension(InferenceEngine::IExtensionPtr extension) {}
-void GNAPlugin::SetConfig(const std::map<std::string, std::string> &config) {}
+
+void GNAPlugin::SetConfig(const std::map<std::string, std::string> &config) {
+    std::vector<std::string> supportedConfigOptions = {
+        GNA_CONFIG_KEY(SCALE_FACTOR),
+        GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE),
+        GNA_CONFIG_KEY(DEVICE_MODE),
+        GNA_CONFIG_KEY(COMPACT_MODE),
+        CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS),
+        GNA_CONFIG_KEY(PRECISION),
+        GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN),
+        CONFIG_KEY(PERF_COUNT),
+        GNA_CONFIG_KEY(LIB_N_THREADS),
+        CONFIG_KEY(SINGLE_THREAD)
+    };
+
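+    // note: substring match below, so suffixed keys such as a per-input SCALE_FACTOR are accepted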
+    for (auto& item : config) {
+        auto keys = std::find_if(supportedConfigOptions.begin(), supportedConfigOptions.end(), [&item](std::string supportedConfigOption) {
+            return item.first.find(supportedConfigOption) != std::string::npos;
+        });
+        if (keys == supportedConfigOptions.end()) {
+            THROW_GNA_EXCEPTION << as_status << NOT_FOUND << "Incorrect GNA Plugin config. Key " << item.first << " not supported";
+        }
+    }
+
+    // 'key' holds the suffix of a matched prefixed option, 'value' holds the option's value
+    std::string key;
+    std::string value;
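+    // if_set: runs the handler when the exact config key is present; 'value' receives its setting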
+    auto if_set = [&](std::string keyInput, const std::function<void()> & handler) {
+        auto keyInMap = config.find(keyInput);
+        if (keyInMap != config.end()) {
+            value = keyInMap->second;
+            handler();
+        }
+    };
+
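+    // if_start: runs the handler for every config key that starts with keyInput (e.g. a
+    // per-input SCALE_FACTOR key); 'key' receives the suffix after the separator character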
+    auto if_start = [&](std::string keyInput, const std::function<void()> & handler) {
+        for (auto && c : config) {
+            if (c.first.find(keyInput) == 0) {
+                if (c.first.size() > keyInput.size() + 1) {
+                    key = c.first.substr(keyInput.size() + 1);
+                    value = c.second;
+                    handler();
+                }
+            }
+        }
+    };
+
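+    // approximate float equality with a relative tolerance of 1e-5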
+    auto fp32eq = [](float p1, float p2) -> bool {
+        return (std::abs(p1 - p2) <= 0.00001f * std::min(std::abs(p1), std::abs(p2)));
+    };
+
+    auto & log = gnalog();
+
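+    // per-input scale factors arrive as one SCALE_FACTOR key per input, suffixed with the input name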
+    if_start(GNA_CONFIG_KEY(SCALE_FACTOR), [&, this] {
+        // only identical scale factors supported so far
+        auto ref = input_scale_factor.size() ? input_scale_factor.begin()->second : 1.0;
+        input_scale_factor[key] = std::stod(value);
+        if (ref != 1.0 && !fp32eq(input_scale_factor[key], ref)) {
+            log << "only identical input scale factors supported, but provided: " << ref << " and " << input_scale_factor[key];
+            THROW_GNA_EXCEPTION << "only identical input scale factors supported, but provided: " << ref << " and " << input_scale_factor[key];
+        }
+    });
+
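+    // fall back to a bare SCALE_FACTOR key (no input suffix): a single factor stored under a placeholder name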
+    if (input_scale_factor.empty()) {
+        if_set(GNA_CONFIG_KEY(SCALE_FACTOR), [&] {
+            input_scale_factor["placeHolder"] = std::stod(value);
+        });
+    }
+
+    if_set(GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE), [&] {
+        dumpXNNPath = value;
+    });
+
+    if_set(GNA_CONFIG_KEY(DEVICE_MODE), [&] {
+        static caseless_unordered_map <std::string, uint32_t> supported_values = {
+                {GNAConfigParams::GNA_AUTO, GNA_AUTO},
+                {GNAConfigParams::GNA_HW, GNA_HARDWARE},
+                {GNAConfigParams::GNA_SW, GNA_SOFTWARE},
+                {GNAConfigParams::GNA_SW_EXACT, GNA_SOFTWARE & GNA_HARDWARE}
+        };
+        auto procType = supported_values.find(value);
+        if (procType == supported_values.end()) {
+            log << "GNA device mode unsupported: " << value;
+            THROW_GNA_EXCEPTION << "GNA device mode unsupported: " << value;
+        }
+        gna_proc_type = static_cast<intel_gna_proc_t>(procType->second);
+    });
+
+    if_set(GNA_CONFIG_KEY(COMPACT_MODE), [&] {
+        if (value == PluginConfigParams::YES) {
+            compact_mode = true;
+        } else if (value == PluginConfigParams::NO) {
+            compact_mode = false;
+        } else {
+            log << "GNA compact mode should be YES/NO, but not" << value;
+            THROW_GNA_EXCEPTION << "GNA compact mode should be YES/NO, but not" << value;
+        }
+    });
+
+    if_set(CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS), [&] {
+        if (value == PluginConfigParams::YES) {
+            exclusive_async_requests = true;
+        } else if (value == PluginConfigParams::NO) {
+            exclusive_async_requests = false;
+        } else {
+            log << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not " << value;
+            THROW_GNA_EXCEPTION << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not " << value;
+        }
+    });
+
+    if_set(GNA_CONFIG_KEY(PRECISION), [&] {
+        auto precision = Precision::FromStr(value);
+        if (precision != Precision::I8 && precision != Precision::I16) {
+            log << "Unsupported precision of GNA hardware, should be Int16 or Int8, but was: " << value;
+            THROW_GNA_EXCEPTION << "Unsupported precision of GNA hardware, should be Int16 or Int8, but was: " << value;
+        }
+        gnaPrecision = precision;
+    });
+
+    if_set(GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN), [&] {
+        if (value == PluginConfigParams::YES) {
+            uniformPwlDesign = true;
+        } else if (value == PluginConfigParams::NO) {
+            uniformPwlDesign = false;
+        } else {
+            log << "GNA pwl uniform algorithm parameter "
+                << "should be equal to YES/NO, but not" << value;
+            THROW_GNA_EXCEPTION << "GNA pwl uniform algorithm parameter "
+                                << "should be equal to YES/NO, but not" << value;
+        }
+    });
+
+    if_set(CONFIG_KEY(PERF_COUNT), [&] {
+        if (value == PluginConfigParams::YES) {
+            performance_counting = true;
+        } else if (value == PluginConfigParams::NO) {
+            performance_counting = false;
+        } else {
+            log << "GNA performance counter enabling parameter "
+                << "should be equal to YES/NO, but not" << value;
+            THROW_GNA_EXCEPTION << "GNA performance counter enabling parameter "
+                                << "should be equal to YES/NO, but not" << value;
+        }
+    });
+
+    if_set(GNA_CONFIG_KEY(LIB_N_THREADS), [&] {
+        uint64_t lib_threads = std::stoul(value, NULL, 10);
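+        // allowed range is 1..126, i.e. up to (uint8_t max)/2 - 1 threads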
+        if (lib_threads == 0 || lib_threads > std::numeric_limits<uint8_t>::max()/2-1) {
+            log << "Unsupported accelerator lib number of threads: " << value << ", should be greateer than 0 and less than 127";
+            THROW_GNA_EXCEPTION << "Unsupported accelerator lib number of threads: " << value
+                                << ", should be greateer than 0 and less than 127";
+        }
+        gna_lib_async_threads_num = lib_threads;
+    });
+
+    if_set(CONFIG_KEY(SINGLE_THREAD), [&] {
+        if (value == PluginConfigParams::YES) {
+            gna_openmp_multithreading = false;
+        } else if (value == PluginConfigParams::NO) {
+            gna_openmp_multithreading = true;
+        } else {
+            log << "SINGLE_THREAD should be YES/NO, but not " << value;
+            THROW_GNA_EXCEPTION << "SINGLE_THREAD should be YES/NO, but not " << value;
+        }
+    });
+}
+
+/**
+ * @deprecated Use the version with the config parameter
+ */
+void GNAPlugin::QueryNetwork(const InferenceEngine::ICNNNetwork& network,
+                             InferenceEngine::QueryNetworkResult& res) const {
+    QueryNetwork(network, {}, res);
+}
+
+void GNAPlugin::QueryNetwork(const InferenceEngine::ICNNNetwork& network,
+                             const std::map<std::string, std::string>& config,
+                             InferenceEngine::QueryNetworkResult& res) const {
+    std::unordered_set<CNNLayer *> allLayers;
+    InferenceEngine::InputsDataMap inputs;
+
+    network.getInputsInfo(inputs);
+    std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(network);
+
+    if (inputs.empty()) {
+        THROW_GNA_EXCEPTION << "Network is empty (GNA)\n";
+    }
+
+    auto const & secondLayers = inputs.begin()->second->getInputData()->getInputTo();
+    if (secondLayers.empty()) {
+        THROW_GNA_EXCEPTION << "Network consists of input layer only (GNA)\n";
+    }
+
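+    // DFS over every layer reachable from the first input; a layer is reported as
+    // supported when its type maps to a known GNA layer type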
+    InferenceEngine::details::UnorderedDFS(allLayers,
+                                           secondLayers.begin()->second,
+                                           [&](CNNLayerPtr const layer) {
+                                                if (GNAPluginNS::GNAPlugin::LayerTypeFromStr(layer->type) != NO_TYPE) {
+                                                    res.supportedLayers.insert(layer->name);
+                                                }
+                                            }, false);
+}
 
 intel_dnn_component_t * GNAPlugin::find_first_unused_input(InferenceEngine::CNNLayerPtr current) {
     if (current->insData.empty()) return nullptr;
@@ -2076,7 +2437,7 @@ void GNAPlugin::connectOutput(InferenceEngine::CNNLayerPtr layer, void *ptr, voi
                 } else {
                     IE_ASSERT(nextMemoryLayer.reserved_size == ALIGN64(num_data_bytes_out));
                     // same offsets
-                    gnamem->bind_ptr(ptr, &nextMemoryLayer.gna_ptr, nextMemoryLayer.reserved_offset);
+                    gnamem->bind_ptr(ptr, &nextMemoryLayer.gna_ptr, 0);
                 }
                 return;
             }
@@ -2119,6 +2480,13 @@ void GNAPlugin::connectOutput(InferenceEngine::CNNLayerPtr layer, void *ptr, voi
                                     });
                         if (included == concat_connection.end()) {
                             gnamem->reserve_ptr(&concatLayerInfoItem.gna_ptr, ALIGN64(concatLayerInfoItem.reserved_size));
+
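+                            // record how much of the 64-byte-aligned concat buffer lies past each
+                            // direct network input, so connectInput's size check accepts it later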
+                            for (auto && inputLayer : concatLayerInfoItem.concatInputLayers) {
+                                if (InferenceEngine::details::CaselessEq<std::string>()(inputLayer.name, "input")) {
+                                    bytes_alllocated_for_input[inputLayer.name] = ALIGN64(concatLayerInfoItem.reserved_size) - inputLayer.offset;
+                                }
+                            }
                         }
                         concatLayerInfo->second.output_allocation_flag = true;
                     }
@@ -2158,7 +2526,15 @@ intel_dnn_component_t * GNAPlugin::findDnnLayer(CNNLayerPtr __layer) {
     return nullptr;
 }
 
-GNAPlugin::ConnectionDetails GNAPlugin::connectInput(CNNLayerPtr layer, void *ptr, size_t num_data_bytes_in, size_t offset, int idx) {
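+// returns the pointer vector for the named input, lazily creating its storage
+// on first use and caching an iterator to it by name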
+std::vector<void *>& GNAPlugin::get_ptr_inputs_global(std::string name) {
+    if (!ptr_inputs_global_id.count(name)) {
+        ptr_inputs_global_storage.push_front({});
+        ptr_inputs_global_id[name] = ptr_inputs_global_storage.begin();
+    }
+    return *ptr_inputs_global_id[name];
+}
+
+GNAPlugin::ConnectionDetails GNAPlugin::connectInput(CNNLayerPtr layer, void *ptr, size_t num_data_bytes_in, int32_t offset, int idx) {
     // selecting particular input layers
     auto prevLayer = CNNNetPrevLayer(layer, idx);
 
@@ -2166,15 +2542,24 @@ GNAPlugin::ConnectionDetails GNAPlugin::connectInput(CNNLayerPtr layer, void *pt
 
     // real input not a memory input
     if (LayerInfo(prevLayer).isInput()) {
-        if (0 == bytes_alllocated_for_input) {
-            gnamem->push_value(&ptr_inputs_global.front(), static_cast<uint8_t>(0), num_data_bytes_in, 64);
-            bytes_alllocated_for_input = num_data_bytes_in;
+        if (0 == bytes_alllocated_for_input[prevLayer->name]) {
+            gnamem->push_value(&get_ptr_inputs_global(prevLayer->name).front(), static_cast<uint8_t>(0), num_data_bytes_in, 64);
+            bytes_alllocated_for_input[prevLayer->name] = num_data_bytes_in;
         }
-        if (ALIGN(num_data_bytes_in, 64) > ALIGN(bytes_alllocated_for_input, 64)) {
-            THROW_IE_EXCEPTION << "Layer: " << layer->name << " Cannot bind pointer to already allocated input, due to size_allocated="
-                                  << bytes_alllocated_for_input << ", and size_requested=" << num_data_bytes_in;
+        if (ALIGN(num_data_bytes_in, 64) > ALIGN(bytes_alllocated_for_input[prevLayer->name], 64)) {
+            THROW_GNA_EXCEPTION
+                << "Layer: " << layer->name
+                << " Cannot bind pointer to already allocated input(" << prevLayer->name
+                << "), due to size_allocated=" << bytes_alllocated_for_input[prevLayer->name]
+                << ", and size_requested=" << num_data_bytes_in;
         }
-        gnamem->bind_ptr(ptr, &ptr_inputs_global.front(), offset);
+
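+        // a negative offset reverses the binding direction: the input's global pointer is
+        // bound inside the already-reserved region behind ptr (e.g. an input feeding a concat)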
+        if (offset >= 0) {
+            gnamem->bind_ptr(ptr, &get_ptr_inputs_global(prevLayer->name).front(), offset);
+        } else {
+            gnamem->bind_ptr(&get_ptr_inputs_global(prevLayer->name).front(), ptr, -offset);
+        }
+
         return prevLayer;
     }
 
@@ -2213,7 +2598,7 @@ GNAPlugin::ConnectionDetails GNAPlugin::connectInput(CNNLayerPtr layer, void *pt
                                                     prevLayer->name);
         if (concatLayerInfo != concat_connection.end()) {
             auto & concatLayerInfoItem = concatLayerInfo->second;
-            // dnnLayer that is input for concat output layer
+            // dnnLayer that is input for concat layer
             gnamem->bind_ptr(ptr, &concatLayerInfoItem.gna_ptr, offset);
             // return layer over concat
             return CNNNetPrevLayer(prevLayer);