[CPU] Add explicit storage for MemoryNode (#895)
author Alexander Peskov <alexander.peskov@intel.com>
Thu, 13 Aug 2020 16:06:20 +0000 (19:06 +0300)
committer GitHub <noreply@github.com>
Thu, 13 Aug 2020 16:06:20 +0000 (19:06 +0300)
26 files changed:
inference-engine/src/mkldnn_plugin/bf16transformer.cpp
inference-engine/src/mkldnn_plugin/bf16transformer.h
inference-engine/src/mkldnn_plugin/mkldnn_exec_network.cpp
inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp
inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.cpp
inference-engine/src/mkldnn_plugin/mkldnn_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.hpp
inference-engine/tests/functional/inference_engine/transformations/convert_precision.cpp
inference-engine/tests/functional/plugin/cpu/bfloat16/bfloat16_helpers.hpp
inference-engine/tests/functional/plugin/cpu/bfloat16/memory_conv.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/cpu/shared_tests_instances/execution_graph_tests/keep_assign.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/cpu/shared_tests_instances/subgraph_tests/split_concat_memory.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/shared/include/execution_graph_tests/keep_assing.hpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/shared/include/subgraph_tests/split_concat_memory.hpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/shared/src/execution_graph_tests/keep_assing.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/shared/src/subgraph_tests/split_concat_memory.cpp [new file with mode: 0644]
inference-engine/tests/ie_test_utils/common_test_utils/data_utils.cpp [new file with mode: 0644]
inference-engine/tests/ie_test_utils/common_test_utils/data_utils.hpp
inference-engine/tests/ie_test_utils/functional_test_utils/layer_test_utils.cpp
inference-engine/tests/ie_test_utils/functional_test_utils/layer_test_utils.hpp
inference-engine/tests/unit/cpu/CMakeLists.txt
inference-engine/tests/unit/cpu/bf16_transformer_test.cpp [new file with mode: 0644]
inference-engine/tests_deprecated/functional/shared_tests/single_layer_tests/eltwise_tests.hpp
inference-engine/tests_deprecated/functional/shared_tests/single_layer_tests/quantize_tests.hpp
ngraph/core/src/op/read_value.cpp

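As an orientation for the diffs below, here is a minimal, self-contained sketch of the pattern this change introduces (toy names, not the plugin classes): the state lives in a buffer explicitly owned by the memory-input node, the paired memory-output node writes into that buffer, and the consumer edge is filled from it on execute, instead of the state being kept alive on a graph edge.

    // Toy model only; the real classes are MKLDNNMemoryInputNode / MKLDNNMemoryOutputNode below.
    #include <vector>

    struct Memory { std::vector<float> data; };                        // stands in for MKLDNNMemory

    class MemoryInput {
        Memory store_;                                                  // explicit storage, zero-filled by default
    public:
        explicit MemoryInput(size_t n) : store_{std::vector<float>(n, 0.f)} {}
        Memory& getStore() { return store_; }                           // exposed so the state can be queried
        void storeState(const Memory& s) { store_.data = s.data; }      // the paired output node writes here
        void execute(Memory& dst) const { dst.data = store_.data; }     // copy the state to the consumer edge
    };

    class MemoryOutput {
        MemoryInput* inputNode_;                                        // paired sibling node
    public:
        explicit MemoryOutput(MemoryInput* in) : inputNode_(in) {}
        // was: memcpy into the sibling's child edge; now: hand the state to its explicit store
        void execute(const Memory& src) { inputNode_->storeState(src); }
    };

The exec network can then read the persisted state through getStore() (see the mkldnn_exec_network.cpp hunk) instead of peeking at the MemoryInput child edge memory.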
index c9784c1..0d8ef1d 100644 (file)
@@ -55,8 +55,11 @@ void BF16Transformer::convertToBFloat16(InferenceEngine::CNNNetwork &network) {
     InputsDataMap inputs = network.getInputsInfo();
     OutputsDataMap outputs = network.getOutputsInfo();
     for (auto iter : sortedLayers) {
-        if (_skipmarking.find(iter->type) != _skipmarking.end()) {
-            continue;
+        // check if the input of a memory output node needs to be converted to bf16
+        if (iter->type == "Memory" && iter->outData.size() == 0 &&
+            iter->insData[0].lock()->getPrecision() == Precision::FP32) {
+            auto curPrec = iter->insData[0].lock()->getPrecision();
+            iter->insData[0].lock()->setPrecision(Precision::BF16);
         }
         for (size_t o = 0; o < iter->outData.size(); o++) {
             if (inputs.find(iter->outData[o]->getName()) == inputs.end()
@@ -66,7 +69,6 @@ void BF16Transformer::convertToBFloat16(InferenceEngine::CNNNetwork &network) {
             }
         }
     }
-
     // convert all edges back to FP32 on demand
     optimizeToFloat(network);
 }
@@ -108,13 +110,9 @@ void BF16Transformer::optimizeToFloat(InferenceEngine::CNNNetwork &network) {
             toAnalyzeTensors.insert(output.second);
         }
     }
-
     // 2b. go over all unknown layers for this algo and mark them as fp32 and add to the toAnalyzeTensors
     // 2c. go over all inputs to _initbf16 and if they are fp32 - add them to the toAnalyzeTensors
     for (auto iter : sortedLayers) {
-        if (_skipmarking.find(iter->type) != _skipmarking.end()) {
-            continue;
-        }
         if (_initbf16.find(iter->type) == _initbf16.end()
             && _complementbf16.find(iter->type) == _complementbf16.end()
             && _multiinput.find(iter->type) == _multiinput.end()) {
@@ -156,7 +154,6 @@ void BF16Transformer::optimizeToFloat(InferenceEngine::CNNNetwork &network) {
             }
         }
     }
-
     // 3 - while toAnalyzeTensors is not empty look at the layers dealing with tensors mentioned in toAnalyzeTensors
     while (!toAnalyzeTensors.empty()) {
         DataPtr tensor = *toAnalyzeTensors.begin();
@@ -167,6 +164,10 @@ void BF16Transformer::optimizeToFloat(InferenceEngine::CNNNetwork &network) {
         if (_initbf16.find(layer->type) == _initbf16.end()) {
             // for all inputs investigate and modify tensor precision if required
             for (size_t i = 0; i < layer->insData.size(); i++) {
+                auto creator = getCreatorLayer(layer->insData[i].lock());
+                if (_skipmarking.find(creator.lock()->type) != _skipmarking.end()) {
+                    continue;
+                }
                 bool marked = tryToMarkFP32(layer->insData[i].lock(), immutable);
                 if (marked) {
                     toAnalyzeTensors.insert(layer->insData[i].lock());
@@ -183,6 +184,18 @@ void BF16Transformer::optimizeToFloat(InferenceEngine::CNNNetwork &network) {
         for (auto inputTo : getInputTo(tensor)) {
             for (size_t o = 0; o < inputTo.second->outData.size(); o++) {
                 if (inputTo.second->outData[o]->getTensorDesc().getPrecision() == Precision::BF16) {
+                    // if some layer (e.g. memory) consumes the tensor but has to stay paired with another layer
+                    // (e.g. memory output) in the net, we must prevent this tensor from being marked as fp32
+                    bool notToMarkFP32 = false;
+                    for (auto consumer : getInputTo(inputTo.second->outData[o])) {
+                        if (_skipmarking.find(consumer.second->type) !=
+                            _skipmarking.end()) {
+                            notToMarkFP32 = true;
+                        }
+                    }
+                    if (notToMarkFP32) {
+                        continue;
+                    }
                     bool marked = tryToMarkFP32(inputTo.second->outData[o], immutable);
                     if (marked) {
                         toAnalyzeTensors.insert(layer->outData[o]);
index c725a57..370656e 100644 (file)
@@ -16,11 +16,12 @@ class BF16Transformer {
         { "convolution", "fullyconnected", "innerproduct" };
     const InferenceEngine::details::caseless_set<std::string> _complementbf16 =
         { "relu", "tanh", "elu", "square", "abs", "sqrt", "linear", "bounded_relu", "soft_relu", "logistic",
-          "exp", "gelu", "clamp", "swish", "prelu", "pooling", "norm", "gather" };
+          "exp", "gelu", "clamp", "swish", "prelu", "pooling", "norm", "gather", "memory" };
     const InferenceEngine::details::caseless_set<std::string> _multiinput =
         { "concat", "eltwise" };
+    //  prevent fallback to fp32 without considering both input and output nodes
     const InferenceEngine::details::caseless_set<std::string> _skipmarking =
-        { "const" };
+        { "memory" };
 
     /**
     * Tries to mark tensor as FP32 by analyzing of local consumers of the tensor. Do not mark if
index 1766ff5..a07f7bc 100644 (file)
@@ -11,6 +11,7 @@
 #include "mkldnn_infer_request.h"
 #include "mkldnn_memory_state.h"
 #include "mkldnn_itt.h"
+#include "nodes/mkldnn_memory_node.hpp"
 #include "bf16transformer.h"
 #include <legacy/ie_util_internal.hpp>
 #include <legacy/graph_tools.hpp>
@@ -153,7 +154,8 @@ MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network
     if (_graphs.size() == 1) {
         for (auto &node : _graphs.begin()->get()->GetNodes()) {
             if (node->getType() == MemoryInput) {
-                auto state_store = node->getChildEdgeAt(0)->getMemoryPtr();
+                auto memoryNode = dynamic_cast<MKLDNNMemoryInputNode*>(node.get());
+                auto state_store = memoryNode->getStore();
                 auto state_name = node->getName();
 
                 // Remove suffix with pair ID. Internal information.
index 478a773..73306e8 100644 (file)
@@ -634,10 +634,6 @@ void MKLDNNGraph::AllocateWithReuse() {
             isConst  |= isConstOutput(edge);
             isOutput |= edge->getChild()->getType() == Output;
             isInput  |= edge->getParent()->getType() == Input;
-
-            // WA. MemoryOutput will keep data in that edge
-            // So need to make it immortal..
-            isConst |= edge->getParent()->getType() == MemoryInput;
         }
 
         if (reuse_io_tensors) {
index 936a963..75885fc 100644 (file)
@@ -46,6 +46,7 @@ std::shared_ptr<ICNNNetwork> dump_graph_as_ie_ngraph_net(const MKLDNNGraph &grap
 
     ngraph::ResultVector results;
     ngraph::ParameterVector params;
+    ngraph::NodeVector to_hold;
 
     auto get_inputs = [&] (const MKLDNNNodePtr & node) {
         auto pr_edges = node->getParentEdges();
@@ -67,7 +68,7 @@ std::shared_ptr<ICNNNetwork> dump_graph_as_ie_ngraph_net(const MKLDNNGraph &grap
     };
 
     auto create_ngraph_node = [&](const MKLDNNNodePtr &node) {
-        bool is_input = false, is_output = false;
+        bool is_input = false, is_output = false, should_be_hold = false;
         for (auto && kvp : graph.inputNodes) {
             if (kvp.second == node) {
                 is_input = true;
@@ -82,6 +83,12 @@ std::shared_ptr<ICNNNetwork> dump_graph_as_ie_ngraph_net(const MKLDNNGraph &grap
             }
         }
 
+        if (!is_output && node->getChildEdges().empty()) {
+            // The node has no consumer and is not an output.
+            // It has to be kept alive in the graph in some other, irregular way.
+            should_be_hold = true;
+        }
+
         auto meta_data = extract_node_metadata(node);
         std::shared_ptr<ngraph::Node> return_node;
         if (is_input) {
@@ -106,6 +113,10 @@ std::shared_ptr<ICNNNetwork> dump_graph_as_ie_ngraph_net(const MKLDNNGraph &grap
             }
         }
 
+        if (should_be_hold) {
+            to_hold.push_back(return_node);
+        }
+
         for (auto && kvp : meta_data)
             return_node->get_rt_info()[kvp.first] = std::make_shared<::ngraph::VariantWrapper<std::string>>(kvp.second);
         return_node->set_friendly_name(node->getName());
@@ -120,6 +131,11 @@ std::shared_ptr<ICNNNetwork> dump_graph_as_ie_ngraph_net(const MKLDNNGraph &grap
         node2layer[node] = nodes.back();
     }
 
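+    // Nodes collected in to_hold have no data consumers and are not outputs (e.g. MemoryOutput),
+    // so attach them as control dependencies of the first result to keep them in the dumped function.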
+    auto holder = results[0];
+    for (auto &node : to_hold) {
+        holder->add_control_dependency(node);
+    }
+
     ngraph::op::GenericIE::DisableReshape reshape(nodes);
     auto function = std::make_shared<ngraph::Function>(results, params, graph._name);
     InferenceEngine::CNNNetwork net(function);
index 045b5e1..12d225f 100644 (file)
@@ -441,6 +441,12 @@ std::string MKLDNNNode::getPrimitiveDescriptorType() {
             } else {
                 str_type += "_I8";
             }
+        } else {
+            if (selectedPrimitiveDesc->getConfig().outConfs[0].desc.getPrecision() != InferenceEngine::Precision::U8) {
+                str_type += "_" + std::string(selectedPrimitiveDesc->getConfig().outConfs[0].desc.getPrecision().name());
+            } else {
+                str_type += "_I8";
+            }
         }
     }
 
index 0b91af7..595729a 100644 (file)
@@ -31,49 +31,89 @@ void MKLDNNMemoryOutputNode::initSupportedPrimitiveDescriptors() {
         return;
 
     InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision();
-    if (precision != InferenceEngine::Precision::FP32)
-        precision = InferenceEngine::Precision::FP32;
     auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
     InferenceEngine::LayerConfig config;
     config.dynBatchSupport = true;
     config.inConfs.resize(1);
     config.inConfs[0].inPlace = -1;
     config.inConfs[0].constant = false;
-    config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format::any);
+    config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, MKLDNNMemory::GetPlainFormat(getParentEdgeAt(0)->getDims()));
     supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, memory::format::any);
 }
 
-const MKLDNNEdgePtr MKLDNNMemoryOutputNode::getChildEdgeAt(size_t idx) const {
-    if (inputNode != nullptr) {
-        return inputNode->getChildEdgeAt(idx);
-    }
-    return MKLDNNNode::getChildEdgeAt(idx);
-}
-
 void MKLDNNMemoryOutputNode::execute(mkldnn::stream strm)  {
     auto& srcMemory = getParentEdgeAt(0)->getMemory();
 
-    const float *src_ptr = reinterpret_cast<const float*>(srcMemory.GetData()) +
-            srcMemory.GetDescriptor().data.layout_desc.blocking.offset_padding;
-    float *dst_ptr = reinterpret_cast<float*>(getChildEdgeAt(0)->getMemory().GetData()) +
-            getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
-    // TODO: this can be eliminated by completely removing MKLDNN memory output NODE, to fuse it with output of prev layer
-    memcpy(dst_ptr, src_ptr, srcMemory.GetSize());
+    auto inputMemoryNode = dynamic_cast<MKLDNNMemoryInputNode*>(inputNode);
+    IE_ASSERT(inputMemoryNode != nullptr);
+    inputMemoryNode->storeState(srcMemory);
 }
 
 #if defined (COMPILED_CPU_MKLDNN_INPUT_NODE)
 MKLDNNMemoryInputNode::MKLDNNMemoryInputNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache)
-        : MKLDNNInputNode(layer, eng, cache), MKLDNNMemoryNode(layer) {
+        : MKLDNNInputNode(layer, eng, cache), MKLDNNMemoryNode(layer), dataStore(new MKLDNNMemory{eng}) {
     if (created()) {
         holder = MKLDNNMemoryNodeVirtualEdge::registerInput(this);
     }
 }
 
+void MKLDNNMemoryInputNode::createPrimitive() {
+    MKLDNNInputNode::createPrimitive();
+
+    auto mem_desc = getChildEdgeAt(0)->getMemoryPtr()->GetDescriptor();
+    dataStore->Create(mem_desc);
+
+    // default memory state is zero filled
+    dataStore->FillZero();
+}
+
+/**
+ * Copy data from one tensor into another.
+ * As is. Assumes both tensors are dense and have the same layout.
+ * @param dst destination memory object
+ * @param src source memory object
+ */
+inline
+static void simple_copy(MKLDNNMemory& dst, const MKLDNNMemory& src) {
+    auto getDataWithOff = [] (const MKLDNNMemory& mem) {
+        auto elemSize = MKLDNNExtensionUtils::sizeOfDataType(mem.GetDataType());
+        return static_cast<uint8_t*>(mem.GetData()) +
+                mem.GetDescriptor().data.layout_desc.blocking.offset_padding * elemSize;
+    };
+
+    auto srcPtr = getDataWithOff(src);
+    auto dstPtr = getDataWithOff(dst);
+    auto srcSizeInByte = src.GetSize();
+    auto dstSizeInByte = dst.GetSize();
+
+    IE_ASSERT(srcSizeInByte == dstSizeInByte) << "Memory objects are not compatible: they have different sizes.";
+
+    memcpy(dstPtr, srcPtr, srcSizeInByte);
+}
+
 MKLDNNMemoryInputNode::~MKLDNNMemoryInputNode() {
     MKLDNNMemoryNodeVirtualEdge::remove(this, holder);
 }
 
+MKLDNNMemoryPtr MKLDNNMemoryInputNode::getStore() {
+    return dataStore;
+}
+
+void MKLDNNMemoryInputNode::storeState(const MKLDNNMemory &new_state) {
+    // TODO: Should be a single call:
+    //           dataStore.SetData(new_state, false);
+    //       But for performance reasons we use a simple manual copy
+    simple_copy(*dataStore, new_state);
+}
+
+void MKLDNNMemoryInputNode::execute(mkldnn::stream strm) {
+    auto dst_mem = getChildEdgeAt(0)->getMemory();
+    // TODO: Should be a single call:
+    //           dst_mem.SetData(dataStore, false);
+    //       But for performance reasons we use a simple manual copy
+    simple_copy(dst_mem, *dataStore);
+}
+
 MKLDNNMemoryNodeVirtualEdge::Holder* MKLDNNMemoryNodeVirtualEdge::registerInput(MKLDNNMemoryInputNode * node) {
     std::lock_guard<std::mutex> lock{MKLDNNMemoryNodeVirtualEdge::holderMutex};
     // in case of output already registered
index 588ea12..14a1c47 100644 (file)
@@ -69,7 +69,6 @@ class MKLDNNMemoryOutputNode : public MKLDNNNode, public MKLDNNMemoryNode {
     ~MKLDNNMemoryOutputNode() override;
     void getSupportedDescriptors() override;
     void initSupportedPrimitiveDescriptors() override;
-    const MKLDNNEdgePtr getChildEdgeAt(size_t idx) const override;
     void createPrimitive() override {}
     void execute(mkldnn::stream strm) override;
     bool created() const override {
@@ -79,6 +78,7 @@ class MKLDNNMemoryOutputNode : public MKLDNNNode, public MKLDNNMemoryNode {
     void setInputNode(MKLDNNNode* node) override {
         inputNode = node;
     }
+
  private:
     /**
      * @brief keeps reference to input sibling node
@@ -97,9 +97,15 @@ public:
     bool created() const override {
         return getType() == MemoryInput;
     }
+    void execute(mkldnn::stream strm) override;
+
+    void createPrimitive() override;
 
     void setInputNode(MKLDNNNode* node) override {}
+    void storeState(const MKLDNNMemory& mem);
+    MKLDNNMemoryPtr getStore();
  private:
+    MKLDNNMemoryPtr dataStore;
     static Register<MKLDNNMemoryInputNode> reg;
     MKLDNNMemoryNodeVirtualEdge::Holder* holder = nullptr;
 };
index 70b8a5a..45d3aa3 100644 (file)
@@ -287,4 +287,27 @@ TEST(TransformationTests, ConvertPrecision_TIBody) {
         ASSERT_FALSE(has_type<ngraph::element::Type_t::f16>(tensor_iterator->get_body()->to_function()));
         ASSERT_FALSE(has_type<ngraph::element::Type_t::i64>(tensor_iterator->get_body()->to_function()));
     }
+}
+
+TEST(TransformationTests, ConvertPrecision_Variables) {
+    std::shared_ptr<ngraph::Function> f(nullptr);
+    {
+        Shape shape {1, 10, 2};
+        auto inp = std::make_shared<opset4::Parameter>(element::f16, shape);
+        auto m_i = std::make_shared<opset4::Constant>(element::f16, shape, 1);
+        auto m_r = std::make_shared<opset4::ReadValue>(m_i, "ID");
+        auto sum = std::make_shared<opset4::Add>(inp, m_r);
+        auto m_w = std::make_shared<opset4::Assign>(sum, "ID");
+        auto mul = std::make_shared<opset4::Multiply>(inp, sum);
+
+        mul->add_control_dependency(m_w);
+
+        f = std::make_shared<Function>(NodeVector{mul}, ParameterVector{inp});
+
+        pass::Manager manager;
+        manager.register_pass<ngraph::pass::ConvertPrecision>(ngraph::element::f16, ngraph::element::f32);
+        manager.run_passes(f);
+    }
+
+    ASSERT_FALSE(has_type<ngraph::element::Type_t::f16>(f));
 }
\ No newline at end of file
index fd67a7b..11d5eb1 100644 (file)
@@ -182,7 +182,7 @@ public:
         if (!InferenceEngine::with_cpu_x86_bfloat16()) {
             // on platforms which do not support bfloat16, we are disabling bf16 tests since there are no bf16 primitives,
             // tests are useless on such platforms
-            return;
+            GTEST_SKIP();
         }
         std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
         InferenceEngine::CNNNetwork cnnNet(fnPtr);
diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/memory_conv.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/memory_conv.cpp
new file mode 100644 (file)
index 0000000..a9352e6
--- /dev/null
@@ -0,0 +1,112 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <string>
+#include <fstream>
+
+#include "functional_test_utils/layer_test_utils.hpp"
+#include "ie_system_conf.h"
+
+#include <ngraph/ngraph.hpp>
+#include <ngraph_ops/fully_connected.hpp>
+
+namespace LayerTestsDefinitions {
+
+using InferenceEngine::Precision;
+using InferenceEngine::SizeVector;
+
+class MemoryConv : public testing::WithParamInterface<LayerTestsUtils::basicParams>,
+                   public LayerTestsUtils::LayerTestsCommon {
+public:
+    static std::string getTestCaseName(testing::TestParamInfo<LayerTestsUtils::basicParams> obj) {
+        Precision netPrecision;
+        SizeVector inputShapes, newInputShapes;
+        std::string targetDevice;
+        std::tie(netPrecision, inputShapes, targetDevice) = obj.param;
+
+        std::ostringstream result;
+        result << "IS=" << CommonTestUtils::vec2str(inputShapes) << "_";
+        result << "netPRC=" << netPrecision.name() << "_";
+        result << "targetDevice=" << targetDevice;
+        return result.str();
+    }
+
+protected:
+    void SetUp() {
+        SizeVector ie_shape;
+        std::tie(inPrc, ie_shape, targetDevice) = this->GetParam();
+
+        using namespace ngraph;
+        using std::make_shared;
+
+        Shape shape = ie_shape;
+        size_t C = shape[1];
+        element::Type type = ngraph::element::f32;
+
+        auto input = make_shared<op::v0::Parameter>(type, shape);
+        auto mem_i = make_shared<op::v0::Constant>(type, shape, 0);
+        auto mem_r = make_shared<op::v3::ReadValue>(mem_i, "id");
+
+        auto mul = make_shared<op::v0::Multiply>(mem_r, input);
+        auto sig = make_shared<op::v0::Sigmoid>(mul);
+
+        auto fc1_w = make_shared<op::v0::Constant>(type, Shape{C, C}, 1);
+        auto fc1_b = make_shared<op::v0::Constant>(type, Shape{C}, 1);
+        auto fc1 = make_shared<op::FullyConnected>(sig, fc1_w, fc1_b, shape);
+
+        auto fc2_w = make_shared<op::v0::Constant>(type, Shape{C, C}, 1);
+        auto fc2_b = make_shared<op::v0::Constant>(type, Shape{C}, 1);
+        auto fc2 = make_shared<op::FullyConnected>(fc1, fc2_w, fc2_b, shape);
+
+        auto mem_w = make_shared<op::v3::Assign>(fc1, "id");
+
+        // WA. Limitation of ngraph: control_dependency is required.
+        mem_w->add_control_dependency(mem_r);
+        fc2->add_control_dependency(mem_w);
+
+        function = std::make_shared<ngraph::Function>(
+                ngraph::NodeVector      {fc2},
+                ngraph::ParameterVector {input},
+                "SimpleNet");
+    }
+};
+
+TEST_P(MemoryConv, CheckTypeConversion) {
+    if (!InferenceEngine::with_cpu_x86_bfloat16())
+        GTEST_SKIP();
+
+    auto ie = PluginCache::get().ie();
+    auto net = InferenceEngine::CNNNetwork(function);
+    auto exe_net = ie->LoadNetwork(net, "CPU");
+    auto inf_reg = exe_net.CreateInferRequest();
+
+    // check data type via exec graph
+    auto exec_graph = exe_net.GetExecGraphInfo();
+    auto exec_ops = exec_graph.getFunction()->get_ops();
+    std::shared_ptr<ngraph::Node> mem_r, mem_w;
+
+    for (auto &node : exec_ops) {
+        auto var = node->get_rt_info()["layerType"];
+        auto s_val = std::dynamic_pointer_cast<ngraph::VariantImpl<std::string>>(var);
+        if (s_val->get() == "MemoryOutput")
+            mem_w = node;
+        if (s_val->get() == "MemoryInput")
+            mem_r = node;
+    }
+
+    ASSERT_NE(nullptr, mem_r);
+    ASSERT_EQ(ngraph::element::bf16, mem_r->output(0).get_element_type());
+
+    ASSERT_NE(nullptr, mem_w);
+    ASSERT_EQ(ngraph::element::bf16, mem_w->input(0).get_element_type());
+}
+
+INSTANTIATE_TEST_CASE_P(CPU, MemoryConv,
+                        ::testing::Combine(
+                                ::testing::Values<Precision>(Precision::BF16, Precision::FP32),
+                                ::testing::Values(SizeVector{1, 200}),
+                                ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        MemoryConv::getTestCaseName);
+
+}  // namespace LayerTestsDefinitions
diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/execution_graph_tests/keep_assign.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/execution_graph_tests/keep_assign.cpp
new file mode 100644 (file)
index 0000000..8876248
--- /dev/null
@@ -0,0 +1,16 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "execution_graph_tests/keep_assing.hpp"
+#include "common_test_utils/test_constants.hpp"
+
+using namespace LayerTestsDefinitions;
+
+namespace {
+
+INSTANTIATE_TEST_CASE_P(KeepAssign, ExecGraphKeepAssignNode,
+        ::testing::Values(CommonTestUtils::DEVICE_CPU),
+        ExecGraphKeepAssignNode::getTestCaseName);
+
+}  // namespace
diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/subgraph_tests/split_concat_memory.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/subgraph_tests/split_concat_memory.cpp
new file mode 100644 (file)
index 0000000..99a11e7
--- /dev/null
@@ -0,0 +1,40 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+
+#include "subgraph_tests/split_concat_memory.hpp"
+
+using namespace LayerTestsDefinitions;
+
+namespace {
+
+const std::vector<InferenceEngine::Precision> netPrecisions = {
+        InferenceEngine::Precision::FP32,
+        InferenceEngine::Precision::I32,
+        InferenceEngine::Precision::FP16,
+        InferenceEngine::Precision::I16,
+        InferenceEngine::Precision::U8,
+        InferenceEngine::Precision::I8,
+};
+
+const std::vector<InferenceEngine::SizeVector> shapes = {
+    {1, 8, 3, 2},
+    {3, 8, 3, 2},
+    {3, 8, 3},
+    {3, 8},
+};
+
+INSTANTIATE_TEST_CASE_P(CPU, SplitConcatMemory,
+                        ::testing::Combine(
+                                ::testing::ValuesIn(shapes),
+                                ::testing::ValuesIn(netPrecisions),
+                                ::testing::Values(1),
+                                ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        SplitConcatMemory::getTestCaseName);
+}  // namespace
+
+
+
+
diff --git a/inference-engine/tests/functional/plugin/shared/include/execution_graph_tests/keep_assing.hpp b/inference-engine/tests/functional/plugin/shared/include/execution_graph_tests/keep_assing.hpp
new file mode 100644 (file)
index 0000000..24ee147
--- /dev/null
@@ -0,0 +1,14 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "gtest/gtest.h"
+
+namespace LayerTestsDefinitions {
+
+class ExecGraphKeepAssignNode : public testing::TestWithParam<std::string> {
+public:
+    static std::string getTestCaseName(testing::TestParamInfo<std::string> obj);
+};
+
+}  // namespace LayerTestsDefinitions
\ No newline at end of file
diff --git a/inference-engine/tests/functional/plugin/shared/include/subgraph_tests/split_concat_memory.hpp b/inference-engine/tests/functional/plugin/shared/include/subgraph_tests/split_concat_memory.hpp
new file mode 100644 (file)
index 0000000..64e010c
--- /dev/null
@@ -0,0 +1,32 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <string>
+
+#include "functional_test_utils/layer_test_utils.hpp"
+
+namespace LayerTestsDefinitions {
+
+using SplitConcatMemoryParamsTuple = typename std::tuple<
+    std::vector<size_t>,         // input shapes
+    InferenceEngine::Precision,  // precision
+    int,                         // axis of split
+    std::string                  // device name
+>;
+
+
+class SplitConcatMemory : public testing::WithParamInterface<SplitConcatMemoryParamsTuple>,
+                          public LayerTestsUtils::LayerTestsCommon {
+public:
+    static std::string getTestCaseName(testing::TestParamInfo<ParamType> obj);
+
+protected:
+    void SetUp() override;
+
+    int axis;
+};
+
+}  // namespace LayerTestsDefinitions
\ No newline at end of file
diff --git a/inference-engine/tests/functional/plugin/shared/src/execution_graph_tests/keep_assing.cpp b/inference-engine/tests/functional/plugin/shared/src/execution_graph_tests/keep_assing.cpp
new file mode 100644 (file)
index 0000000..2a1e752
--- /dev/null
@@ -0,0 +1,66 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "execution_graph_tests/keep_assing.hpp"
+
+#include <ngraph/ngraph.hpp>
+#include <inference_engine.hpp>
+
+namespace LayerTestsDefinitions {
+
+std::string ExecGraphKeepAssignNode::getTestCaseName(testing::TestParamInfo<std::string> obj) {
+    std::string targetDevice = obj.param;
+    return "Dev=" + targetDevice;
+}
+
+/**
+ * An Assign/MemoryOutput operation node may be left hanging in the air (a leaf with no consumer),
+ * so the exec graph may lose it. Check that it is present in the dumped exec graph.
+ */
+TEST_P(ExecGraphKeepAssignNode, KeepAssignNode) {
+    auto device_name = this->GetParam();
+    ngraph::Shape shape = {3, 2};
+    ngraph::element::Type type = ngraph::element::f32;
+
+    using std::make_shared;
+    using namespace ngraph::op;
+
+    // Some simple graph with Memory(Assign) node            //    in   read     //
+    auto input = make_shared<Parameter>(type, shape);        //    | \  /        //
+    auto mem_i = make_shared<Constant>(type, shape, 0);      //    |  mul        //
+    auto mem_r = make_shared<ReadValue>(mem_i, "id");        //    | /  \        //
+    auto mul   = make_shared<Multiply>(mem_r, input);        //    sum  assign   //
+    auto mem_w = make_shared<Assign>(mul, "id");             //     |            //
+    auto sum   = make_shared<Add>(mul, input);               //    out           //
+
+    mem_w->add_control_dependency(mem_r);
+    sum->add_control_dependency(mem_w);
+
+    auto function = std::make_shared<ngraph::Function>(
+            ngraph::NodeVector      {sum},
+            ngraph::ParameterVector {input},
+            "SimpleNet");
+
+    // Load into plugin and get exec graph
+    auto ie  = InferenceEngine::Core();
+    auto net = InferenceEngine::CNNNetwork(function);
+    auto exec_net   = ie.LoadNetwork(net, device_name);
+    auto exec_graph = exec_net.GetExecGraphInfo();
+    auto exec_ops   = exec_graph.getFunction()->get_ops();
+
+    // Check Memory(Assign) node existence
+    bool assign_node_found = false;
+    for (auto &node : exec_ops) {
+        auto var = node->get_rt_info()["layerType"];
+        auto s_val = std::dynamic_pointer_cast<ngraph::VariantImpl<std::string>>(var);
+
+        if (s_val->get() == "MemoryOutput") {
+            assign_node_found = true;
+            break;
+        }
+    }
+    ASSERT_TRUE(assign_node_found);
+}
+
+}  // namespace LayerTestsDefinitions
diff --git a/inference-engine/tests/functional/plugin/shared/src/subgraph_tests/split_concat_memory.cpp b/inference-engine/tests/functional/plugin/shared/src/subgraph_tests/split_concat_memory.cpp
new file mode 100644 (file)
index 0000000..2643154
--- /dev/null
@@ -0,0 +1,136 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "subgraph_tests/split_concat_memory.hpp"
+#include "common_test_utils/xml_net_builder/ir_net.hpp"
+
+namespace LayerTestsDefinitions {
+
+using namespace CommonTestUtils;
+using namespace InferenceEngine;
+
+std::string SplitConcatMemory::getTestCaseName(testing::TestParamInfo<ParamType> obj) {
+    InferenceEngine::Precision netPrecision;
+    InferenceEngine::SizeVector inputShapes;
+    int axis;
+    std::string targetDevice;
+    std::tie(inputShapes, netPrecision, axis, targetDevice) = obj.param;
+
+    std::ostringstream result;
+    result << "IS=" << CommonTestUtils::vec2str(inputShapes) << "_";
+    result << "PRC=" << netPrecision.name() << "_";
+    result << "axis=" << axis << "_";
+    result << "dev=" << targetDevice;
+    return result.str();
+}
+
+void SplitConcatMemory::SetUp() {
+    SizeVector shape;
+    std::tie(shape, inPrc, axis, targetDevice) = this->GetParam();
+
+    auto shape_14 = shape;
+    shape_14[axis] /= 4;
+    auto shape_34 = shape;
+    shape_34[axis] -= shape_14[axis];
+
+    /*
+     *    Cyclic buffer length of 4
+     *        ______   ______
+     *       [_mem1_] [_inp1_]
+     *          _|______|_
+     *         [_concat___]
+     *         _____|______
+     *      __|____     ___|__
+     *     [_plus1_]   [_spl1_]
+     *        |         |    |
+     *      __|___         __|___
+     *     [_out1_]       [_mem2_]
+     */
+    auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(inPrc);
+    ngraph::Shape ng_share_14(shape_14);
+    ngraph::Shape ng_share_34(shape_34);
+
+    auto input = std::make_shared<ngraph::op::Parameter>(ngPrc, ng_share_14);
+    input->set_friendly_name("input");
+
+    auto mem_c = std::make_shared<ngraph::op::Constant>(ngPrc, ng_share_34, 0);
+    auto mem_r = std::make_shared<ngraph::op::ReadValue>(mem_c, "id");
+    auto cnc = std::make_shared<ngraph::op::Concat>(ngraph::NodeVector{mem_r, input}, axis);
+
+    std::vector<int64_t> chunks_val {static_cast<int64_t>(ng_share_14[axis]), static_cast<int64_t>(ng_share_34[axis])};
+    auto chunk_c = std::make_shared<ngraph::op::Constant>(::ngraph::element::i64, ngraph::Shape{chunks_val.size()}, chunks_val);
+    auto axis_c = std::make_shared<ngraph::op::Constant>(::ngraph::element::i64, ngraph::Shape{}, axis);
+    auto spl = std::make_shared<ngraph::op::v1::VariadicSplit>(cnc, axis_c, chunk_c);
+
+    auto one = std::make_shared<ngraph::op::Constant>(ngPrc, ngraph::Shape{}, 1);
+    auto plus = std::make_shared<ngraph::op::Add>(cnc, one, ngraph::op::AutoBroadcastSpec::NUMPY);
+    plus->set_friendly_name("plus_one");
+
+    auto mem_w = std::make_shared<ngraph::op::Assign>(spl->output(1), "id");
+
+    // WA. Ngraph limitation: Assign should have a control dependency on the read,
+    // and something has to hold the Assign node in the function.
+    mem_w->add_control_dependency(mem_r);
+    plus->add_control_dependency(mem_w);
+
+    function = std::make_shared<ngraph::Function>(
+            ngraph::NodeVector      {plus},
+            ngraph::ParameterVector {input},
+            "CyclicBuffer4");
+}
+
+TEST_P(SplitConcatMemory, cyclicBufferCorrectness) {
+    auto ie = PluginCache::get().ie();
+    cnnNetwork = InferenceEngine::CNNNetwork{function};
+
+    auto exe_net = ie->LoadNetwork(cnnNetwork, "CPU");
+    auto inf_reg = exe_net.CreateInferRequest();
+
+    /*
+     * cnc1 out  |    mem    |inp|
+     *           |===============|
+     * iter_1    | 0 | 0 | 0 | 1 |
+     * iter_2    | 0 | 0 | 1 | 2 |
+     * iter 3    | 0 | 1 | 2 | 3 |
+     */
+
+    auto i_blob = inf_reg.GetBlob("input");
+    auto o_blob = inf_reg.GetBlob("plus_one");
+
+    auto o_blob_ref = make_blob_with_precision(o_blob->getTensorDesc());
+    o_blob_ref->allocate();
+
+    auto fill_by_quarter = [this] (Blob::Ptr& blob, std::vector<float> vals) {
+        IE_ASSERT(vals.size() == 4);
+        auto quarter_blocked_shape = blob->getTensorDesc().getDims();
+
+        // split the axis dimension into chunks
+        IE_ASSERT(quarter_blocked_shape[axis] % vals.size() == 0);
+        quarter_blocked_shape[axis] /= vals.size();
+        quarter_blocked_shape.insert(quarter_blocked_shape.begin() + axis, vals.size());
+
+        auto quarter_blocked_view = make_reshape_view(blob, quarter_blocked_shape);
+        fill_data_with_broadcast(quarter_blocked_view, axis, vals);
+    };
+
+    // iteration 1
+    fill_data_const(i_blob, 1);
+    fill_by_quarter(o_blob_ref, {1, 1, 1, 2});
+    inf_reg.Infer();
+    Compare(o_blob_ref, o_blob);
+
+    // iteration 2
+    fill_data_const(i_blob, 2);
+    fill_by_quarter(o_blob_ref, {1, 1, 2, 3});
+    inf_reg.Infer();
+    Compare(o_blob_ref, o_blob);
+
+    // iteration 3
+    fill_data_const(i_blob, 3);
+    fill_by_quarter(o_blob_ref, {1, 2, 3, 4});
+    inf_reg.Infer();
+    Compare(o_blob_ref, o_blob);
+}
+
+}  // namespace LayerTestsDefinitions
\ No newline at end of file
diff --git a/inference-engine/tests/ie_test_utils/common_test_utils/data_utils.cpp b/inference-engine/tests/ie_test_utils/common_test_utils/data_utils.cpp
new file mode 100644 (file)
index 0000000..a878de1
--- /dev/null
@@ -0,0 +1,202 @@
+// Copyright (C) 2019-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <cmath>
+
+#include <debug.h>  // to allow putting vector into exception string stream
+#include <details/ie_exception.hpp>
+
+#include <ie_blob.h>
+#include <blob_factory.hpp>
+
+namespace CommonTestUtils {
+
+bool isDenseBlob(const InferenceEngine::Blob::Ptr& blob) {
+    auto blk_desc = blob->getTensorDesc().getBlockingDesc();
+    auto dims = blk_desc.getBlockDims();
+    auto strs = blk_desc.getStrides();
+
+    IE_ASSERT(dims.size() == strs.size()) << " isDenseBlob: inconsistent tensor descriptor";
+
+    auto size = dims.size();
+    if (size == 0) return true;
+    if (size == 1) return strs[0] == 1;
+
+    for (auto i = size - 1; i > 0; i--) {
+        if (strs[i - 1] != strs[i] * dims[i])
+            return false;
+    }
+
+    return true;
+}
+
+template<typename T>
+void copy_7D(void *src_raw_ptr, std::vector<size_t> &src_str, void *dst_raw_ptr, std::vector<size_t> &dst_str, std::vector<size_t> &dims) {
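+    // A zero entry in src_str repeats the same source element along that dimension; this is how broadcasting is realized.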
+    auto src_ptr = static_cast<T*>(src_raw_ptr);
+    auto dst_ptr = static_cast<T*>(dst_raw_ptr);
+
+    for (size_t d0 = 0; d0 < dims[0]; d0++) { auto src_ptr_0 = src_ptr   + src_str[0]*d0; auto dst_ptr_0 = dst_ptr +   dst_str[0]*d0;
+    for (size_t d1 = 0; d1 < dims[1]; d1++) { auto src_ptr_1 = src_ptr_0 + src_str[1]*d1; auto dst_ptr_1 = dst_ptr_0 + dst_str[1]*d1;
+    for (size_t d2 = 0; d2 < dims[2]; d2++) { auto src_ptr_2 = src_ptr_1 + src_str[2]*d2; auto dst_ptr_2 = dst_ptr_1 + dst_str[2]*d2;
+    for (size_t d3 = 0; d3 < dims[3]; d3++) { auto src_ptr_3 = src_ptr_2 + src_str[3]*d3; auto dst_ptr_3 = dst_ptr_2 + dst_str[3]*d3;
+    for (size_t d4 = 0; d4 < dims[4]; d4++) { auto src_ptr_4 = src_ptr_3 + src_str[4]*d4; auto dst_ptr_4 = dst_ptr_3 + dst_str[4]*d4;
+    for (size_t d5 = 0; d5 < dims[5]; d5++) { auto src_ptr_5 = src_ptr_4 + src_str[5]*d5; auto dst_ptr_5 = dst_ptr_4 + dst_str[5]*d5;
+    for (size_t d6 = 0; d6 < dims[6]; d6++) { auto src_ptr_6 = src_ptr_5 + src_str[6]*d6; auto dst_ptr_6 = dst_ptr_5 + dst_str[6]*d6;
+        *dst_ptr_6 = *src_ptr_6;
+    }}}}}}}
+}
+
+void fill_data_with_broadcast(InferenceEngine::Blob::Ptr& blob, InferenceEngine::Blob::Ptr& values) {
+    using InferenceEngine::SizeVector;
+    constexpr size_t MAX_N_DIMS = 7;  // Suppose it's enough
+
+    IE_ASSERT(blob->getTensorDesc().getPrecision() == values->getTensorDesc().getPrecision());
+
+    auto values_dims = values->getTensorDesc().getDims();
+    auto blob_dims = blob->getTensorDesc().getDims();
+    auto n_dims = blob_dims.size();
+    IE_ASSERT(values_dims.size() <= n_dims);
+    IE_ASSERT(n_dims <= MAX_N_DIMS);
+
+    SizeVector src_dims(MAX_N_DIMS, 1);
+    std::copy(values_dims.rbegin(), values_dims.rend(), src_dims.rbegin());
+
+    SizeVector dst_dims(MAX_N_DIMS, 1);
+    std::copy(blob_dims.rbegin(), blob_dims.rend(), dst_dims.rbegin());
+
+    bool compatible = true;
+    for (int i = 0; i < MAX_N_DIMS; i++) {
+        if (src_dims[i] != dst_dims[i] && src_dims[i] != 1)
+            compatible = false;
+    }
+    IE_ASSERT(compatible) << "fill_data_with_broadcast error: Tensor shape " << values_dims
+                          << " can not be broadcasted to shape " << blob_dims;
+
+    auto fill_strides_like_plain = [] (SizeVector dims) {
+        SizeVector str(dims.size());
+        if (str.empty())
+            return str;
+        else
+            str.back() = 1;
+
+        // stride[i] = stride[i+1]*d[i+1]
+        std::transform(dims.rbegin(), dims.rend() - 1, str.rbegin(), str.rbegin() + 1,
+                       [] (size_t d, size_t s) { return d * s; });
+
+        // zero out strides of broadcast dimensions (where dim == 1)
+        std::transform(str.begin(), str.end(), dims.begin(), str.begin(),
+                       [] (size_t s, size_t d) { return d == 1 ? 0 : s; });
+
+        return str;
+    };
+
+    SizeVector src_strides = fill_strides_like_plain(src_dims);
+    SizeVector dst_strides = fill_strides_like_plain(dst_dims);
+
+    auto get_data = [] (InferenceEngine::Blob::Ptr &blob) {
+        auto mem_blob = dynamic_cast<InferenceEngine::MemoryBlob*>(blob.get());
+        auto mem = mem_blob->rwmap();
+        return mem.as<float*>();
+    };
+
+    auto dst_ptr = get_data(blob);
+    auto src_ptr = get_data(values);
+
+    switch (blob->getTensorDesc().getPrecision()) {
+        case InferenceEngine::Precision::FP32:
+        case InferenceEngine::Precision::I32:
+            copy_7D<uint32_t>(src_ptr, src_strides, dst_ptr, dst_strides, dst_dims);
+            break;
+        case InferenceEngine::Precision::I16:
+        case InferenceEngine::Precision::U16:
+        case InferenceEngine::Precision::FP16:
+        case InferenceEngine::Precision::BF16:
+            copy_7D<uint16_t>(src_ptr, src_strides, dst_ptr, dst_strides, dst_dims);
+            break;
+        case InferenceEngine::Precision::U8:
+        case InferenceEngine::Precision::I8:
+            copy_7D<uint8_t>(src_ptr, src_strides, dst_ptr, dst_strides, dst_dims);
+            break;
+        default:
+            THROW_IE_EXCEPTION << "Unsupported precision by fill_data_with_broadcast function";
+    }
+}
+
+void fill_data_with_broadcast(InferenceEngine::Blob::Ptr& blob, size_t axis, std::vector<float> values) {
+    InferenceEngine::SizeVector value_dims(blob->getTensorDesc().getDims().size() - axis, 1);
+    value_dims.front() = values.size();
+    auto prc = blob->getTensorDesc().getPrecision();
+    auto layout = InferenceEngine::TensorDesc::getLayoutByDims(value_dims);
+    InferenceEngine::TensorDesc value_tdesc(prc, value_dims, layout);
+
+    auto values_blob = make_blob_with_precision(value_tdesc, values.data());
+    fill_data_with_broadcast(blob, values_blob);
+}
+
+InferenceEngine::Blob::Ptr make_reshape_view(const InferenceEngine::Blob::Ptr &blob, InferenceEngine::SizeVector new_shape) {
+    using InferenceEngine::TensorDesc;
+    auto new_size = std::accumulate(new_shape.begin(), new_shape.end(), 1, std::multiplies<size_t>());
+    IE_ASSERT(new_size == blob->size());
+
+    auto orig_mem_blob = dynamic_cast<InferenceEngine::MemoryBlob*>(blob.get());
+    auto orig_mem = orig_mem_blob->rwmap();
+    auto orig_ptr = orig_mem.as<float*>();
+
+    auto new_tdesc = TensorDesc(blob->getTensorDesc().getPrecision(), new_shape, TensorDesc::getLayoutByDims(new_shape));
+    auto new_blob = make_blob_with_precision(new_tdesc, orig_ptr);
+    return new_blob;
+}
+
+/**
+ * Fill a tensor with data, repeating the value list until the tensor is full.
+ *
+ * @tparam PRC
+ * @param data
+ * @param size
+ * @param values
+ */
+template<InferenceEngine::Precision::ePrecision PRC = InferenceEngine::Precision::FP32>
+static void fill_data_const(void *data, size_t size, const std::vector<float> &values) {
+    auto t_data = static_cast<typename InferenceEngine::PrecisionTrait<PRC>::value_type *>(data);
+    auto val_size = values.size();
+    for (size_t i = 0, j = 0; i < size; i++) {
+        t_data[i] = values[j++];
+        if (j == val_size) j = 0;
+    }
+}
+
+void fill_data_const(InferenceEngine::Blob::Ptr& blob, const std::vector<float> &val) {
+    auto prc = blob->getTensorDesc().getPrecision();
+    auto raw_data_ptr = blob->buffer().as<void*>();
+    auto raw_data_size = blob->size();
+
+    using InferenceEngine::Precision;
+    switch (prc) {
+        case Precision::FP32:
+            fill_data_const<Precision::FP32>(raw_data_ptr, raw_data_size, val);
+            break;
+        case Precision::I32:
+            fill_data_const<Precision::I32>(raw_data_ptr, raw_data_size, val);
+            break;
+        case Precision::U8:
+            fill_data_const<Precision::U8>(raw_data_ptr, raw_data_size, val);
+            break;
+        case Precision::I8:
+            fill_data_const<Precision::I8>(raw_data_ptr, raw_data_size, val);
+            break;
+        case Precision::U16:
+            fill_data_const<Precision::U16>(raw_data_ptr, raw_data_size, val);
+            break;
+        case Precision::I16:
+            fill_data_const<Precision::I16>(raw_data_ptr, raw_data_size, val);
+            break;
+        default:
+            THROW_IE_EXCEPTION << "Unsupported precision by fill_data_const() function";
+    }
+}
+
+void fill_data_const(InferenceEngine::Blob::Ptr& blob, float val) {
+    fill_data_const(blob, std::vector<float> {val});
+}
+}  // namespace CommonTestUtils
index a5b1efd..1cd4d53 100644 (file)
@@ -10,6 +10,7 @@
 #include <ngraph/type/float16.hpp>
 
 #include <ie_blob.h>
+#include <blob_factory.hpp>
 #include <random>
 
 namespace CommonTestUtils {
@@ -30,15 +31,46 @@ static void fill_data_sine(float *data, size_t size, float center, float ampl, f
     }
 }
 
-static void fill_data_const(float *data, size_t size, float value) {
-    for (size_t i = 0; i < size; i++) {
-        data[i] = value;
-    }
-}
+/**
+ * Fill a blob from a blob of values. Broadcast semantics are supported,
+ * with alignment through the last dimension.
+ *
+ * @param blob tensor to fill in
+ * @param values src tensor which should be broadcast
+ */
+void fill_data_with_broadcast(InferenceEngine::Blob::Ptr& blob, InferenceEngine::Blob::Ptr& values);
 
-static void fill_data_const(InferenceEngine::Blob::Ptr& blob, float val) {
-    fill_data_const(blob->buffer().as<float*>(), blob->size(), val);
-}
+/**
+ * Wrapper on top of fill_data_with_broadcast with simplified signature
+ *
+ * @param blob the destination blob to fill in
+ * @param axis Axis to apply values
+ * @param values data to broadcast
+ */
+void fill_data_with_broadcast(InferenceEngine::Blob::Ptr& blob, size_t axis, std::vector<float> values);
+
+/**
+ * Make a view blob with a new shape. It reinterprets the original tensor data as a tensor with the new shape.
+ *
+ * NB! Limitation: the new blob does not take ownership of the data buffer. The original blob must stay alive
+ *     while the view is in use.
+ *
+ * @param blob original source tensor
+ * @param new_shape shape for the view blob
+ * @return the view blob
+ */
+InferenceEngine::Blob::Ptr make_reshape_view(const InferenceEngine::Blob::Ptr &blob, InferenceEngine::SizeVector new_shape);
+
+/**
+ * Fill blob with single value for all elements
+ *
+ * like:
+ *     fill_data_with_broadcast(blob, 0, {val});
+ *
+ * @param blob tensor to fill in
+ * @param val value to set into each element
+ */
+void fill_data_const(InferenceEngine::Blob::Ptr& blob, float val);
 
 static void fill_data_bbox(float *data, size_t size, int height, int width, float omega) {
     float center_h = (height - 1.0f) / 2;
@@ -171,7 +203,6 @@ void inline fill_data_random<InferenceEngine::Precision::FP32>(InferenceEngine::
     fill_data_random_float<InferenceEngine::Precision::FP32>(blob, range, start_from, k);
 }
 
-
 template<>
 void inline fill_data_random<InferenceEngine::Precision::FP16>(InferenceEngine::Blob::Ptr &blob, const uint32_t range, int32_t start_from, const int32_t k) {
     fill_data_random_float<InferenceEngine::Precision::FP16>(blob, range, start_from, k);
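A possible usage sketch of the new data helpers (the include path and the example itself are assumptions for illustration; only the signatures documented above come from this change):

    #include "common_test_utils/data_utils.hpp"   // assumed include path
    #include <blob_factory.hpp>

    void example() {
        InferenceEngine::TensorDesc desc(InferenceEngine::Precision::FP32, {2, 4, 3},
                                         InferenceEngine::Layout::CHW);
        auto blob = make_blob_with_precision(desc);
        blob->allocate();

        CommonTestUtils::fill_data_const(blob, 1.f);                                // every element becomes 1.0
        CommonTestUtils::fill_data_with_broadcast(blob, 1, {0.f, 1.f, 2.f, 3.f});   // one value per channel (axis 1)
        auto flat = CommonTestUtils::make_reshape_view(blob, {2, 12});              // same buffer, viewed as 2x12
    }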
index d0a1262..0708818 100644 (file)
@@ -64,6 +64,32 @@ void LayerTestsCommon::Compare(const std::vector<std::uint8_t> &expected, const
     }
 }
 
+void LayerTestsCommon::Compare(const InferenceEngine::Blob::Ptr &expected, const InferenceEngine::Blob::Ptr &actual) {
+    auto get_raw_buffer = [] (const InferenceEngine::Blob::Ptr &blob) {
+        auto memory = InferenceEngine::as<InferenceEngine::MemoryBlob>(blob);
+        IE_ASSERT(memory);
+        const auto lockedMemory = memory->wmap();
+        return lockedMemory.as<const std::uint8_t *>();
+    };
+    const auto expectedBuffer = get_raw_buffer(expected);
+    const auto actualBuffer = get_raw_buffer(actual);
+
+    const auto &precision = actual->getTensorDesc().getPrecision();
+    const auto &size = actual->size();
+    switch (precision) {
+        case InferenceEngine::Precision::FP32:
+            Compare(reinterpret_cast<const float *>(expectedBuffer), reinterpret_cast<const float *>(actualBuffer),
+                    size, threshold);
+            break;
+        case InferenceEngine::Precision::I32:
+            Compare(reinterpret_cast<const std::int32_t *>(expectedBuffer),
+                    reinterpret_cast<const std::int32_t *>(actualBuffer), size, 0);
+            break;
+        default:
+            FAIL() << "Comparator for " << precision << " precision isn't supported";
+    }
+}
+
 void LayerTestsCommon::ConfigurePlugin() {
     if (!configuration.empty()) {
         core->SetConfig(configuration, targetDevice);
index fb5448a..ce86d00 100644 (file)
@@ -56,6 +56,8 @@ public:
 
     virtual void Compare(const std::vector<std::uint8_t> &expected, const InferenceEngine::Blob::Ptr &actual);
 
+    virtual void Compare(const InferenceEngine::Blob::Ptr &expected, const InferenceEngine::Blob::Ptr &actual);
+
     virtual void SetRefMode(RefMode mode);
 
 protected:
index 9ec5ad0..d7acf89 100644 (file)
@@ -14,11 +14,13 @@ addIeTargetTest(
         ROOT ${CMAKE_CURRENT_SOURCE_DIR}
         INCLUDES
             ${IE_MAIN_SOURCE_DIR}/src/mkldnn_plugin
+            ${IE_MAIN_SOURCE_DIR}/src/transformations/include
         OBJECT_FILES
             ${MKLDNN_SRC_OBJ}
         LINK_LIBRARIES
             unitTestUtils
             mkldnn
+            inference_engine_transformations
         ADD_CPPLINT
         LABELS
             CPU
diff --git a/inference-engine/tests/unit/cpu/bf16_transformer_test.cpp b/inference-engine/tests/unit/cpu/bf16_transformer_test.cpp
new file mode 100644 (file)
index 0000000..d96a843
--- /dev/null
@@ -0,0 +1,170 @@
+// Copyright (C) 2018-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <memory>
+#include <gtest/gtest.h>
+
+#include <ngraph/ngraph.hpp>
+#include <ngraph_ops/fully_connected.hpp>
+
+#include <inference_engine.hpp>
+#include <details/ie_cnn_network_tools.h>
+#include <convert_function_to_cnn_network.hpp>
+#include <bf16transformer.h>
+
+using ngraph::Shape;
+using ngraph::element::Type;
+using namespace ngraph::op;
+using std::make_shared;
+using InferenceEngine::Precision;
+
+std::map<std::string, InferenceEngine::CNNLayerPtr> get_layer_collection(InferenceEngine::CNNNetwork net) {
+    IE_SUPPRESS_DEPRECATED_START
+    auto all_layers = InferenceEngine::details::CNNNetSortTopologically(net);
+
+    std::map<std::string, InferenceEngine::CNNLayerPtr> res;
+    for (auto &layer : all_layers) {
+        res[layer->name] = layer;
+    }
+    IE_SUPPRESS_DEPRECATED_END
+    return res;
+}
+
+enum TypeOfNet { NG, IE };
+InferenceEngine::CNNNetwork create_net(std::shared_ptr<ngraph::Function> &func, TypeOfNet type) {
+    InferenceEngine::CNNNetwork ng_net(func);
+    if (type == NG)
+        return ng_net;
+    else
+        return InferenceEngine::CNNNetwork {InferenceEngine::details::convertFunctionToICNNNetwork(func, ng_net)};
+}
+
+
+TEST(BF16TransformerTest, KeepMemoryPrecision) {
+    /*
+     *  Suggested pattern
+     *     _______   _____
+     *    [_mem_r_] [_inp_]
+     *        _|______|_
+     *       [___mul____]
+     *          __|__
+     *         [_sig_]
+     *          __|__
+     *         [_fc1_]
+     *         ___|____
+     *     ___|___   __|__
+     *    [_mem_w_] [_fc2_]
+     *               __|__
+     *              [_out_]
+     *
+     *  If memory precision is not handled explicitly, mem_w will get the precision of the data
+     *  between the fc1 and fc2 operations. With BF16 enabled it should be BF16.
+     *  However, mem_r still keeps the original precision.
+     */
+    Shape shape = {3, 2};
+    Type type = ngraph::element::f32;
+    auto input = make_shared<Parameter>(type, shape);
+    auto mem_i = make_shared<Constant>(type, shape, 0);
+    auto mem_r = make_shared<ReadValue>(mem_i, "id");
+    mem_r->set_friendly_name("mem_r");
+
+    auto mul = make_shared<Multiply>(mem_r, input);
+    auto sig = make_shared<Sigmoid>(mul);
+
+    auto fc1_w = make_shared<Constant>(type, Shape{2, 2}, 1);
+    auto fc1_b = make_shared<Constant>(type, Shape{2}, 1);
+    auto fc1 = make_shared<FullyConnected>(sig, fc1_w, fc1_b, shape);
+
+    auto fc2_w = make_shared<Constant>(type, Shape{2, 2}, 1);
+    auto fc2_b = make_shared<Constant>(type, Shape{2}, 1);
+    auto fc2 = make_shared<FullyConnected>(fc1, fc2_w, fc2_b, shape);
+
+    auto mem_w = make_shared<Assign>(fc1, "id");
+    mem_w->set_friendly_name("mem_w");
+
+    // WA. Limitation of ngraph: control_dependency is required.
+    mem_w->add_control_dependency(mem_r);
+    fc2->add_control_dependency(mem_w);
+
+    auto function = make_shared<ngraph::Function>(
+            ngraph::NodeVector      {fc2},
+            ngraph::ParameterVector {input});
+
+    auto net = create_net(function, IE);
+
+    // Apply tested BF16 transformation
+    MKLDNNPlugin::BF16Transformer transformer;
+    transformer.convertToBFloat16(net);
+
+    // Check precision
+    auto layers = get_layer_collection(net);
+    IE_SUPPRESS_DEPRECATED_START
+    Precision prc_mem_r = layers["mem_r"]->outData[0]->getPrecision();
+    Precision prc_mem_w = layers["mem_w"]->insData[0].lock()->getPrecision();
+    IE_SUPPRESS_DEPRECATED_END
+
+    ASSERT_EQ(prc_mem_r, Precision::BF16);
+    ASSERT_EQ(prc_mem_w, Precision::BF16);
+}
+
+TEST(BF16TransformerTest, DISABLED_KeepMemoryPrecisionWithGEMM) {
+    /*     _______   _____
+     *    [_mem_r_] [_inp_]
+     *        _|______|_
+     *       [___mul____]
+     *          __|__
+     *         [_sig_]
+     *          __|____
+     *         [_gemm1_]
+     *         ___|____
+     *     ___|___   __|____
+     *    [_mem_w_] [_gemm2_]
+     *               __|__
+     *              [_out_]
+     *
+     *  Same as the KeepMemoryPrecision test, with FC replaced by GEMM
+     */
+    Shape shape = {3, 2};
+    Type type = ngraph::element::f32;
+    auto input = make_shared<Parameter>(type, shape);
+    auto mem_i = make_shared<Constant>(type, shape, 0);
+    auto mem_r = make_shared<ReadValue>(mem_i, "id");
+    mem_r->set_friendly_name("mem_r");
+
+    auto mul = make_shared<Multiply>(mem_r, input);
+    auto sig = make_shared<Sigmoid>(mul);
+
+    auto fc1_w = make_shared<Constant>(type, Shape{2, 2}, 1);
+    auto fc1 = make_shared<MatMul>(sig, fc1_w);
+
+    auto fc2_w = make_shared<Constant>(type, Shape{2, 2}, 1);
+    auto fc2 = make_shared<MatMul>(fc1, fc2_w);
+
+    auto mem_w = make_shared<Assign>(fc1, "id");
+    mem_w->set_friendly_name("mem_w");
+
+    // WA. Limitation of ngraph: control_dependency is required.
+    mem_w->add_control_dependency(mem_r);
+    fc2->add_control_dependency(mem_w);
+
+    auto function = make_shared<ngraph::Function>(
+            ngraph::NodeVector      {fc2},
+            ngraph::ParameterVector {input});
+
+    auto net = create_net(function, IE);
+
+    // Apply tested BF16 transformation
+    MKLDNNPlugin::BF16Transformer transformer;
+    transformer.convertToBFloat16(net);
+
+    // Check precision
+    auto layers = get_layer_collection(net);
+    IE_SUPPRESS_DEPRECATED_START
+    Precision prc_mem_r = layers["mem_r"]->outData[0]->getPrecision();
+    Precision prc_mem_w = layers["mem_w"]->insData[0].lock()->getPrecision();
+    IE_SUPPRESS_DEPRECATED_END
+
+    ASSERT_EQ(prc_mem_r, Precision::BF16);
+    ASSERT_EQ(prc_mem_w, Precision::BF16);
+}
index a8235b3..4480690 100644 (file)
@@ -271,7 +271,7 @@ class EltwiseOnlyTest : public TestsCommon,
                 if (p.op != eltwise_test_params::Pow)
                     CommonTestUtils::fill_data_sine(inputBlob->buffer().as<float*>(), inputBlob->size(), 100, 10, 10);
                 else
-                    CommonTestUtils::fill_data_const(inputBlob->buffer().as<float*>(), inputBlob->size(), 2);
+                    CommonTestUtils::fill_data_const(inputBlob, 2);
 
                 srcs_vec.push_back(inputBlob);
             }
index 357b37c..9b2d28f 100644 (file)
@@ -240,7 +240,7 @@ protected:
             Blob::Ptr output_low_data = make_shared_blob<float>({Precision::FP32, { p.ic_const_blobs }, Layout::C});
             output_low_data->allocate();
             if (p.levels == 2) {
-                CommonTestUtils::fill_data_const(output_low_data->buffer().as<float*>(), output_low_data->size(), low_val);
+                CommonTestUtils::fill_data_const(output_low_data, low_val);
             } else {
                 CommonTestUtils::fill_data_sine(output_low_data->buffer().as<float*>(), output_low_data->size(), low_center, 2.f, 0.3f);
             };
@@ -249,7 +249,7 @@ protected:
             Blob::Ptr output_high_data = make_shared_blob<float>({Precision::FP32, {p.ic_const_blobs}, Layout::C});
             output_high_data->allocate();
             if (p.levels == 2) {
-                CommonTestUtils::fill_data_const(output_high_data->buffer().as<float*>(), output_high_data->size(), high_val);
+                CommonTestUtils::fill_data_const(output_high_data, high_val);
             } else {
                 CommonTestUtils::fill_data_sine(output_high_data->buffer().as<float*>(), output_high_data->size(), high_center, 2.f, 0.3f);
             };
index f6581a6..8394b91 100644 (file)
@@ -34,7 +34,10 @@ void op::ReadValue::validate_and_infer_types()
     auto output_shape = get_input_partial_shape(0);
 
     VariableInfo info = {output_shape, arg_t, m_variable_id};
-    m_variable = std::make_shared<Variable>(info);
+    if (m_variable == nullptr)
+        m_variable = std::make_shared<Variable>(info);
+    else
+        m_variable->update(info);
     set_output_type(0, arg_t, output_shape);
 }