[CPU] Generic JIT Eltwise implementation (#1464)
author: Gorokhov Dmitriy <dmitry.gorokhov@intel.com>
Wed, 28 Oct 2020 06:16:28 +0000 (09:16 +0300)
committer: GitHub <noreply@github.com>
Wed, 28 Oct 2020 06:16:28 +0000 (09:16 +0300)
54 files changed:
inference-engine/src/mkldnn_plugin/CMakeLists.txt
inference-engine/src/mkldnn_plugin/mkldnn_descriptor.cpp
inference-engine/src/mkldnn_plugin/mkldnn_descriptor.h
inference-engine/src/mkldnn_plugin/mkldnn_exec_network.cpp
inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp
inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h
inference-engine/src/mkldnn_plugin/mkldnn_node.cpp
inference-engine/src/mkldnn_plugin/mkldnn_node.h
inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp
inference-engine/src/mkldnn_plugin/nodes/common/emitter.cpp [new file with mode: 0644]
inference-engine/src/mkldnn_plugin/nodes/common/emitter.h [new file with mode: 0644]
inference-engine/src/mkldnn_plugin/nodes/jit_eltwise_emitters.cpp [new file with mode: 0644]
inference-engine/src/mkldnn_plugin/nodes/jit_eltwise_emitters.hpp [new file with mode: 0644]
inference-engine/src/mkldnn_plugin/nodes/jit_mkldnn_emitters.cpp [new file with mode: 0644]
inference-engine/src/mkldnn_plugin/nodes/jit_mkldnn_emitters.hpp [new file with mode: 0644]
inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.cpp [deleted file]
inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.h [deleted file]
inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_bin_conv_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_def_conv_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp [deleted file]
inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.h [deleted file]
inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_interpolate_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.cpp [deleted file]
inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.h [deleted file]
inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_reduce_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_resample_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_scatter_update_node.cpp
inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/eltwise.cpp
inference-engine/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp
inference-engine/tests/functional/plugin/cpu/single_layer_tests/eltwise.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/eltwise_chain.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/shared/src/single_layer_tests/eltwise.cpp
inference-engine/tests/ngraph_functions/src/utils/ngraph_helpers.cpp
inference-engine/tests_deprecated/unit/engines/mkldnn/graph/layers/internal/graph_eltwise_test.cpp
inference-engine/tests_deprecated/unit/engines/mkldnn/graph/layers/internal/graph_power_test.cpp
inference-engine/tests_deprecated/unit/engines/mkldnn/graph/structure/graph_conv_depthwise_fusing_test.cpp
inference-engine/tests_deprecated/unit/engines/mkldnn/graph/structure/graph_structure_test.cpp
inference-engine/tests_deprecated/unit/engines/mkldnn/graph/test_graph.hpp
inference-engine/thirdparty/mkl-dnn
ngraph/python/tests/__init__.py
ngraph/python/tests/test_ngraph/test_ops_fused.py
ngraph/python/tests/test_onnx/test_backend.py
ngraph/python/tests/test_onnx/test_ops_logical.py
ngraph/python/tests/test_onnx/test_ops_nonlinear.py

index 2ed81eb..6557976 100644 (file)
@@ -9,7 +9,6 @@ if (WIN32)
 endif()
 
 set(LAYERS
-    ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_activation_node.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_batchnorm_node.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_bin_conv_node.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_concat_node.cpp
@@ -17,7 +16,6 @@ set(LAYERS
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_crop_node.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_deconv_node.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_def_conv_node.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_depthwise_node.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_eltwise_node.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_fullyconnected_node.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_gemm_node.cpp
@@ -27,7 +25,6 @@ set(LAYERS
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_memory_node.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_permute_node.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_pooling_node.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_power_node.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_quantize_node.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_reorder_node.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_reshape_node.cpp
@@ -94,7 +91,10 @@ set(LAYERS
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/unique.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/unsqueeze.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/common/softmax.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/nodes/common/emitter.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/interp.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/nodes/jit_eltwise_emitters.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/nodes/jit_mkldnn_emitters.cpp
 
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/argmax.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/argmax_imp.cpp
index d871e71..06f4074 100644 (file)
@@ -150,19 +150,6 @@ MKLDNNDescriptor::operator std::shared_ptr<mkldnn::softmax_forward::desc>() {
     return typeDesc->getPtr();
 }
 
-MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::depthwise_forward::desc> desc) {
-    this->desc.reset(new DescFwdImpl<mkldnn::depthwise_forward::desc>(desc));
-}
-
-MKLDNNDescriptor::operator std::shared_ptr<mkldnn::depthwise_forward::desc>() {
-    DescFwdImpl<mkldnn::depthwise_forward::desc> *typeDesc =
-            dynamic_cast<DescFwdImpl<mkldnn::depthwise_forward::desc> *>(desc.get());
-    if (typeDesc == nullptr) {
-        THROW_IE_EXCEPTION << "Cannot cast descriptor!";
-    }
-    return typeDesc->getPtr();
-}
-
 MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::rnn_forward::desc> desc) {
     this->desc.reset(new DescFwdImpl<mkldnn::rnn_forward::desc>(desc));
 }
index bbdc50c..cd59e17 100644 (file)
@@ -37,9 +37,6 @@ public:
     explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::softmax_forward::desc> desc);
     operator std::shared_ptr<mkldnn::softmax_forward::desc>();
 
-    explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::depthwise_forward::desc> desc);
-    operator std::shared_ptr<mkldnn::depthwise_forward::desc>();
-
     explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::rnn_forward::desc> desc);
     operator std::shared_ptr<mkldnn::rnn_forward::desc>();
 
index 2d0ca6e..f387b69 100644 (file)
@@ -30,6 +30,7 @@
 #include <unordered_set>
 #include <utility>
 #include <cstring>
+#include <legacy/details/ie_cnn_network_tools.h>
 
 using namespace MKLDNNPlugin;
 using namespace InferenceEngine;
@@ -57,18 +58,17 @@ MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network
     if (_cfg.lpTransformsMode == Config::LPTransformsMode::On) {
 #ifdef USE_CNNNETWORK_LPT
         auto params = LayerTransformation::Params(true,  // updatePrecisions
-                                                    true,  // quantizeOutputs
-                                                    true,  // weightsToConst
-                                                    LayerTransformation::QuantizedTensorAlignment::UpdateLevel,  // quantizedTensorAlignmentOnActivations
-                                                    LayerTransformation::QuantizedTensorAlignment::None,  // quantizedTensorAlignmentOnWeights
-                                                    true,  // roundQuantizedValues
-                                                    true,  // updateBiases
-                                                    true);  // supportAsymmetricQuantization
+                                                  true,  // quantizeOutputs
+                                                  true,  // weightsToConst
+                                                  LayerTransformation::QuantizedTensorAlignment::UpdateLevel,  // quantizedTensorAlignmentOnActivations
+                                                  LayerTransformation::QuantizedTensorAlignment::None,  // quantizedTensorAlignmentOnWeights
+                                                  true,  // roundQuantizedValues
+                                                  true,  // updateBiases
+                                                  true);  // supportAsymmetricQuantization
         LowPrecisionTransformer transformer(LowPrecisionTransformer::getAllTransformations(params).
             add<ConvolutionTransformation>(LayerTransformation::Params(params).setPrecisionsOnActivations({ Precision::U8 }), "Convolution").
-            addCleanup<ScaleShiftToConvolutionTransformation>(
-                LayerTransformation::Params(params).setPrecisionsOnActivations({ Precision::U8 }),
-                "ScaleShift"));
+            remove("ScaleShift").
+            remove("Power"));
         transformer.transform(*_clonedNetwork);
 #endif
 
@@ -102,6 +102,59 @@ MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network
 
     MKLDNNGraph::ApplyUnrollPasses(static_cast<ICNNNetwork&>(*_clonedNetwork));
 
+    auto createConstInputTo = [&](CNNLayerPtr layer, Blob::Ptr blob, std::string name) {  // Wraps `blob` in a new "Const" layer and appends that layer's output as an extra input of `layer`.
+        LayerParams attrs = {layer.get()->name + "_const_" + name, "Const", blob->getTensorDesc().getPrecision()};  // new layer is named "<layer>_const_<name>" and takes its precision from the blob
+        auto constLayer = std::make_shared<InferenceEngine::CNNLayer>(attrs);
+        constLayer->blobs["custom"] = blob;  // the payload is stored on the Const layer under the "custom" key
+
+        std::vector<size_t> constDims(layer->insData[0].lock()->getDims().size(), 1);  // same rank as the layer's first input, every dim initialised to 1
+        if (constDims.size() > 1)
+            constDims[1] = blob.get()->size();  // rank > 1: the blob's element count goes on axis 1 (presumably the channel axis - TODO confirm)
+        else
+            constDims[0] = blob.get()->size();  // otherwise it goes on axis 0; NOTE(review): a rank-0 first input would make this index out of range
+        const TensorDesc& td = {blob->getTensorDesc().getPrecision(), constDims, TensorDesc::getLayoutByDims(constDims)};  // layout is derived from the dims just built
+
+        DataPtr newEdgeAfterLayer(new Data(constLayer->name, td));  // output data object of the Const layer, sharing its name
+        newEdgeAfterLayer->setName(constLayer->name);
+        getCreatorLayer(newEdgeAfterLayer) = constLayer;  // the Const layer is registered as the producer of the new data
+        getInputTo(newEdgeAfterLayer).clear();  // start with no consumers; `layer` is added as the only one below
+
+        _clonedNetwork->addData(constLayer->name.c_str(), newEdgeAfterLayer);  // register the new data with the cloned network
+        IE_SUPPRESS_DEPRECATED_START
+        _clonedNetwork->addLayer(constLayer);  // register the new layer with the cloned network
+        IE_SUPPRESS_DEPRECATED_END
+
+        constLayer->outData.push_back(newEdgeAfterLayer);  // wire Const layer -> new data
+        getInputTo(newEdgeAfterLayer)[layer->name] = layer;  // wire new data -> `layer`
+        layer->insData.push_back(newEdgeAfterLayer);  // appended after the inputs `layer` already has
+    };
+
+    auto all_layers = details::CNNNetSortTopologically(*_clonedNetwork);  // Convert the "weights"/"biases" blobs of single-input ScaleShift and PReLU layers into explicit Const inputs.
+    for (auto &layer : all_layers) {
+        if (layer->type == "ScaleShift" && layer->insData.size() == 1) {  // only layers that still have exactly one input are rewritten
+            Blob::Ptr scalesBlob = layer->blobs["weights"];
+            if (scalesBlob != nullptr)
+                createConstInputTo(layer, scalesBlob, "weights");  // weights are attached first, so they follow the data input
+
+            Blob::Ptr shiftBlob = layer->blobs["biases"];
+            if (shiftBlob != nullptr) {
+                createConstInputTo(layer, shiftBlob, "biases");  // NOTE(review): if "weights" was null, biases become the first const input - confirm that cannot happen
+            } else if (scalesBlob != nullptr) {  // weights without biases: synthesize an all-zero bias blob so both const inputs are attached
+                Blob::Ptr biases = make_shared_blob<float>(scalesBlob->getTensorDesc());  // reuses the weights' tensor descriptor; NOTE(review): the float element type assumes FP32 weights - confirm
+                biases->allocate();
+                auto biasesPtr = biases->buffer().as<float*>();
+                for (size_t i = 0; i < biases->size(); i++)
+                    biasesPtr[i] = 0;  // zero-fill the freshly allocated buffer
+
+                createConstInputTo(layer, biases, "biases");
+            }
+        } else if (layer->type == "PReLU" && layer->insData.size() == 1) {  // PReLU: only the "weights" blob is converted
+            Blob::Ptr scalesBlob = layer->blobs["weights"];
+            if (scalesBlob != nullptr)
+                createConstInputTo(layer, scalesBlob, "weights");
+        }
+    }
+
     if (_cfg.batchLimit > 1) {
         // check topology for applicability
         if (!CanProcessDynBatch(*_clonedNetwork)) {
@@ -272,7 +325,6 @@ bool MKLDNNExecNetwork::CanProcessDynBatch(const InferenceEngine::ICNNNetwork &n
             type != SoftMax &&
             type != Split &&
             type != Concatenation &&
-            type != Power &&
             type != Eltwise &&
             type != Crop &&
             type != BatchNormalization &&
index 41e17fb..ccdef34 100644 (file)
@@ -6,10 +6,8 @@
 
 #include "mkldnn_extension_utils.h"
 #include "nodes/mkldnn_reshape_node.h"
-#include "nodes/mkldnn_activation_node.h"
 #include "nodes/mkldnn_pooling_node.h"
 #include "nodes/mkldnn_eltwise_node.h"
-#include "nodes/mkldnn_depthwise_node.h"
 #include "nodes/mkldnn_concat_node.h"
 #include "nodes/mkldnn_reorder_node.h"
 #include "nodes/mkldnn_conv_node.h"
@@ -18,6 +16,7 @@
 #include "nodes/mkldnn_mvn_node.h"
 #include "nodes/mkldnn_resample_node.h"
 #include "nodes/mkldnn_interpolate_node.h"
+#include "nodes/mkldnn_input_node.h"
 
 #include <blob_factory.hpp>
 #include <legacy/ie_layers_internal.hpp>
@@ -49,9 +48,6 @@ void MKLDNNGraphOptimizer::ApplyCommonGraphOptimizations(MKLDNNGraph &graph) {
     MergeTwoEqualScaleShifts(graph);
     graph.RemoveDroppedNodes();
 
-    MergeSigmoidAndMultiplyToSwish(graph);
-    graph.RemoveDroppedNodes();
-
     MergeConversions(graph);
     graph.RemoveDroppedNodes();
 
@@ -70,20 +66,14 @@ void MKLDNNGraphOptimizer::ApplyCommonGraphOptimizations(MKLDNNGraph &graph) {
     FuseConvolutionAndZeroPoints(graph);
     graph.RemoveDroppedNodes();
 
-#if defined (COMPILED_CPU_MKLDNN_DEPTHWISE_NODE)
     FuseConvolutionAndDepthwise(graph);
     graph.RemoveDroppedNodes();
-#endif
 
-#if defined(COMPILED_CPU_MKLDNN_ACTIVATION_NODE)
     FuseConvolutionAndActivation(graph);
     graph.RemoveDroppedNodes();
-#endif
 
-#if defined (COMPILED_CPU_MKLDNN_DEPTHWISE_NODE)
     FuseConvolutionAndDepthwise(graph);
     graph.RemoveDroppedNodes();
-#endif
 
     FuseConvolutionAndQuantize(graph);
     graph.RemoveDroppedNodes();
@@ -91,10 +81,8 @@ void MKLDNNGraphOptimizer::ApplyCommonGraphOptimizations(MKLDNNGraph &graph) {
     graph.SortTopologically();
     graph.RemoveDroppedEdges();
 
-#if defined (COMPILED_CPU_MKLDNN_DEPTHWISE_NODE)
     FuseConvolutionAndDepthwise(graph);
     graph.RemoveDroppedNodes();
-#endif
 
     FusePoolingAndQuantize(graph);
     graph.RemoveDroppedNodes();
@@ -206,16 +194,6 @@ void MKLDNNGraphOptimizer::MergeConversions(MKLDNNGraph& graph) {
 }
 
 void MKLDNNGraphOptimizer::FuseConvolutionAndZeroPoints(MKLDNNGraph &graph) {
-    auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) {
-        auto& edges = graph.GetEdges();
-        for (auto it = edges.begin(); it != edges.end(); it++) {
-            if ((*it) == edge) {
-                edges.erase(it);
-                return;
-            }
-        }
-    };
-
     auto& graphNodes = graph.GetNodes();
 
     auto isSutableConvNode = [](MKLDNNNodePtr node) {
@@ -241,11 +219,8 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndZeroPoints(MKLDNNGraph &graph) {
         int OC = node->getChildEdgesAtPort(0)[0]->getDims()[1];
 
         if (parent0->getType() == Eltwise) {
-            auto * eltwiseLayer = dynamic_cast<EltwiseLayer*>(parent0->getCnnLayer().get());
-            if (eltwiseLayer == nullptr)
-                THROW_IE_EXCEPTION << "Cannot get eltwise layer " << node->getName();
-
-            if (eltwiseLayer->_operation != EltwiseLayer::Sub)
+            auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(parent0.get());  // parent0 reports type Eltwise; the cast can still yield nullptr if the node is not an MKLDNNEltwiseNode
+            if (eltwiseNode == nullptr || eltwiseNode->getOpType() != Subtract)  // a failed cast disqualifies the parent instead of dereferencing a null pointer; only Subtract is accepted
                 return false;
 
             if (parent0->getParentEdges().size() != 2)
@@ -296,11 +271,8 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndZeroPoints(MKLDNNGraph &graph) {
         int OC = node->getChildEdgesAtPort(0)[0]->getDims()[1];
 
         if (parent0->getType() == Eltwise) {
-            auto * eltwiseLayer = dynamic_cast<EltwiseLayer*>(parent0->getCnnLayer().get());
-            if (eltwiseLayer == nullptr)
-                THROW_IE_EXCEPTION << "Cannot get eltwise layer " << node->getName();
-
-            if (eltwiseLayer->_operation != EltwiseLayer::Sub)
+            auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(parent0.get());  // parent0 reports type Eltwise; the cast can still yield nullptr if the node is not an MKLDNNEltwiseNode
+            if (eltwiseNode == nullptr || eltwiseNode->getOpType() != Subtract)  // a failed cast disqualifies the parent instead of dereferencing a null pointer; only Subtract is accepted
                 return false;
 
             if (parent0->getParentEdges().size() != 2)
@@ -482,17 +454,17 @@ void MKLDNNGraphOptimizer::MergeTwoEqualScaleShifts(MKLDNNGraph& graph) {
     auto& graphNodes = graph.GetNodes();
 
     auto isSutableScaleShiftNode = [](MKLDNNNodePtr node) {
-        if (node->getType() != Depthwise)
+        if (node->getType() != Eltwise)
             return false;
 
-        auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode *>(node.get());
-        if (depthwiseNode == nullptr)
-            THROW_IE_EXCEPTION << "Cannot cast " << node->getName() << " to Depthwise node";
+        auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
+        if (eltwiseNode == nullptr)
+            THROW_IE_EXCEPTION << "Cannot cast " << node->getName() << " to Eltwise node";
 
-        if (depthwiseNode->getChildEdges().size() != 1)
+        if (eltwiseNode->getChildEdges().size() != 1)
             return false;
 
-        if (depthwiseNode->getAlgorithm() != depthwise_scale_shift || depthwiseNode->isBroadcast())
+        if (eltwiseNode->getOpType() != MulAdd)
             return false;
 
         return true;
@@ -502,16 +474,16 @@ void MKLDNNGraphOptimizer::MergeTwoEqualScaleShifts(MKLDNNGraph& graph) {
         if (node1->getParentEdgeAt(0) != node2->getParentEdgeAt(0))
             return false;
 
-        auto *depthwiseNode1 = dynamic_cast<MKLDNNDepthwiseNode *>(node1.get());
-        auto *depthwiseNode2 = dynamic_cast<MKLDNNDepthwiseNode *>(node2.get());
+        auto *eltwiseNode1 = dynamic_cast<MKLDNNEltwiseNode *>(node1.get());
+        auto *eltwiseNode2 = dynamic_cast<MKLDNNEltwiseNode *>(node2.get());
 
-        auto depthwiseLayer1 = depthwiseNode1->getCnnLayer();
-        auto depthwiseLayer2 = depthwiseNode2->getCnnLayer();
+        auto eltwiseLayer1 = eltwiseNode1->getCnnLayer();
+        auto eltwiseLayer2 = eltwiseNode2->getCnnLayer();
 
-        Blob::Ptr scalesBlob1 = depthwiseLayer1->blobs["weights"];
-        Blob::Ptr shiftsBlob1 = depthwiseLayer1->blobs["biases"];
-        Blob::Ptr scalesBlob2 = depthwiseLayer2->blobs["weights"];
-        Blob::Ptr shiftsBlob2 = depthwiseLayer2->blobs["biases"];
+        Blob::Ptr scalesBlob1 = eltwiseLayer1->blobs["weights"];
+        Blob::Ptr shiftsBlob1 = eltwiseLayer1->blobs["biases"];
+        Blob::Ptr scalesBlob2 = eltwiseLayer2->blobs["weights"];
+        Blob::Ptr shiftsBlob2 = eltwiseLayer2->blobs["biases"];
         if (scalesBlob1 == nullptr || shiftsBlob1 == nullptr || scalesBlob2 == nullptr || shiftsBlob2 == nullptr)
             return false;
 
@@ -533,6 +505,16 @@ void MKLDNNGraphOptimizer::MergeTwoEqualScaleShifts(MKLDNNGraph& graph) {
     auto MergeScaleShiftNodes = [&](MKLDNNNodePtr childNode1, MKLDNNNodePtr childNode2) {
         auto parentNode = childNode2->getParentEdgeAt(0)->getParent();
         auto ccNode2 = childNode2->getChildEdgeAt(0)->getChild();
+
+        auto parentEdges = childNode2->parentEdges;  // iterate over a copy of the edge list (presumably because removeEdge mutates the node's own list - TODO confirm)
+        for (auto &parentEdge : parentEdges) {
+            auto p_edge = parentEdge.lock();  // NOTE(review): the lock() result is used without a null check
+            if (p_edge->getParent() == parentNode)  // keep the edge that comes from the shared parent
+                continue;
+
+            removeEdge(graph, p_edge);  // every other parent edge of childNode2 is removed before the node is dropped
+        }
+
         graph.DropNode(childNode2);
 
         MKLDNNEdgePtr remEdge;
@@ -572,103 +554,6 @@ void MKLDNNGraphOptimizer::MergeTwoEqualScaleShifts(MKLDNNGraph& graph) {
     }
 }
 
-void MKLDNNGraphOptimizer::MergeSigmoidAndMultiplyToSwish(MKLDNNGraph& graph) {
-    auto& graphNodes = graph.GetNodes();
-    std::vector<MKLDNNNodePtr> newNodes;
-
-    MKLDNNNodePtr parentNode;
-    MKLDNNNodePtr activationNode, eltwiseNode;
-    MKLDNNEdgePtr remEdge;
-
-    auto areSutableChildNodes = [&]() {
-        auto childNode1 = parentNode->getChildEdgeAt(0)->getChild();
-        auto childNode2 = parentNode->getChildEdgeAt(1)->getChild();
-
-        if (childNode1->getType() == Activation && childNode2->getType() == Eltwise) {
-            activationNode = childNode1;
-            eltwiseNode = childNode2;
-            remEdge = parentNode->getChildEdgeAt(1);
-        } else if (childNode1->getType() == Eltwise && childNode2->getType() == Activation) {
-            activationNode = childNode2;
-            eltwiseNode = childNode1;
-            remEdge = parentNode->getChildEdgeAt(0);
-        } else {
-            return false;
-        }
-
-        if (activationNode->getParentEdges().size() != 1 || activationNode->getChildEdges().size() != 1)
-            return false;
-
-        if (eltwiseNode->getParentEdges().size() != 2)
-            return false;
-
-        if (activationNode->getChildEdgeAt(0)->getChild() != eltwiseNode)
-            return false;
-
-        auto *activationNodePtr = dynamic_cast<MKLDNNActivationNode *>(activationNode.get());
-        if (activationNodePtr == nullptr)
-            THROW_IE_EXCEPTION << "Cannot cast " << activationNode->getName() << " to Activation node";
-        if (activationNodePtr->getAlgorithm() != eltwise_logistic)
-            return false;
-
-        auto *eltwiseNodePtr = dynamic_cast<MKLDNNEltwiseNode *>(eltwiseNode.get());
-        if (eltwiseNodePtr == nullptr)
-            THROW_IE_EXCEPTION << "Cannot cast " << eltwiseNode->getName() << " to Eltwise node";
-        auto *eltwiseLayer = dynamic_cast<EltwiseLayer*>(eltwiseNode->getCnnLayer().get());
-        if (eltwiseLayer == nullptr)
-            THROW_IE_EXCEPTION << "Cannot get eltwise layer " << eltwiseNode->getName();
-        if (eltwiseLayer->_operation != EltwiseLayer::Prod)
-            return false;
-
-        return true;
-    };
-
-    auto MergeToSwish = [&]() {
-        //  1. Remove edge Parent-Eltwise
-        remEdge->drop();
-        graph.GetEdges().erase(std::remove(graph.GetEdges().begin(), graph.GetEdges().end(), remEdge), graph.GetEdges().end());
-
-        //  2. Remove Sigmoid node and edges Parent-Sigmoid and Sigmoid-Eltwise
-        graph.DropNode(activationNode);
-        remEdge = parentNode->getChildEdgeAt(0);
-        auto oIndex = remEdge->getOutputNum();
-        auto iIndex = remEdge->getInputNum();
-        remEdge->drop();
-        graph.GetEdges().erase(std::remove(graph.GetEdges().begin(), graph.GetEdges().end(), remEdge), graph.GetEdges().end());
-
-        //  3. Create Swish node
-        CNNLayerPtr swishLayer(new CNNLayer(*activationNode->getCnnLayer().get()));
-        swishLayer->name = activationNode->getName() + "_Swish";
-        swishLayer->type = "Swish";
-        MKLDNNNodePtr swishNode(new MKLDNNActivationNode(swishLayer, graph.getEngine(), graph.weightsCache));
-
-        //  4. Create edges Parent-Swish and Swish-Eltwise, connect to Swish node, add edges to graph
-        MKLDNNEdgePtr beforeSwishEdge(new MKLDNNEdge(parentNode, swishNode, iIndex, 0));
-        MKLDNNEdgePtr afterSwishEdge(new MKLDNNEdge(swishNode, eltwiseNode, 0, oIndex));
-        swishNode->addEdge(beforeSwishEdge);
-        swishNode->addEdge(afterSwishEdge);
-        graph.GetEdges().push_back(beforeSwishEdge);
-        graph.GetEdges().push_back(afterSwishEdge);
-        newNodes.push_back(swishNode);
-
-        //  5. Remove Eltwise node
-        graph.DropNode(eltwiseNode);
-    };
-
-    for (int i = 0; i < graphNodes.size(); i++) {
-        parentNode = graphNodes[i];
-        if (parentNode->getChildEdges().size() != 2)
-            continue;
-
-        if (!areSutableChildNodes()) continue;
-
-        MergeToSwish();
-    }
-    for (int i = 0; i < newNodes.size(); i++) {
-        graph.GetNodes().push_back(newNodes[i]);
-    }
-}
-
 void MKLDNNGraphOptimizer::FuseBatchNormWithScale(MKLDNNGraph &graph) {
     auto &graphNodes = graph.GetNodes();
 
@@ -683,8 +568,18 @@ void MKLDNNGraphOptimizer::FuseBatchNormWithScale(MKLDNNGraph &graph) {
                                 return x->getName() == node_name;}) == outputNodes.end()) {
                 if (bn->getChildEdges().size() == 1) {
                     auto child = bn->getChildEdgeAt(0)->getChild();
-                    if (child->type == Depthwise && child->getCnnLayer()->type == "ScaleShift") {
+                    if (child->type == Eltwise && child->getCnnLayer()->type == "ScaleShift") {
                         bn->fuseWith(child);
+
+                        auto parentEdges = child->parentEdges;  // iterate over a copy of the edge list (presumably because removeEdge mutates the node's own list - TODO confirm)
+                        for (auto &parentEdge : parentEdges) {
+                            auto p_edge = parentEdge.lock();  // NOTE(review): the lock() result is used without a null check
+                            if (p_edge->getParent()->getType() == BatchNormalization)  // keep the edge that comes from a BatchNormalization parent
+                                continue;
+
+                            removeEdge(graph, p_edge);  // every other parent edge of the fused child is removed before the node is dropped
+                        }
+
                         graph.DropNode(child);
                     }
                 }
@@ -693,30 +588,19 @@ void MKLDNNGraphOptimizer::FuseBatchNormWithScale(MKLDNNGraph &graph) {
     }
 }
 
-#if defined(COMPILED_CPU_MKLDNN_ACTIVATION_NODE)
 void MKLDNNGraphOptimizer::FuseConvolutionAndActivation(MKLDNNGraph &graph) {
-    auto isOneOf = [&](mkldnn::algorithm alg, std::vector<mkldnn::algorithm> algs) {
-        for (auto a : algs) {
-            if (alg == a) {
-                return true;
-            }
-        }
-        return false;
-    };
-
     auto& graphNodes = graph.GetNodes();
 
     auto isFusingSupported = [&](MKLDNNNodePtr conv, MKLDNNNodePtr activation) {
         if (!activation->getCnnLayer())
             return false;
 
-        auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(activation.get());
+        auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(activation.get());
 
-        return activationNode &&
-            (activationNode->getAlgorithm() == eltwise_relu ||
+        return eltwiseNode &&
+            (eltwiseNode->getOpType() == Relu ||
             (conv->getCnnLayer()->precision == Precision::FP32 &&
-             isOneOf(activationNode->getAlgorithm(), {eltwise_elu, eltwise_logistic, eltwise_bounded_relu, eltwise_clamp,
-                                                      eltwise_swish, eltwise_hswish, eltwise_mish, eltwise_hsigmoid})));
+            IsOneOf(eltwiseNode->getOpType(), {Elu, Logistic, BoundedRelu, Clamp, Swish, Hswish, Mish, Hsigmoid})));
     };
 
     for (int i = 0; i < graphNodes.size(); i++) {
@@ -766,25 +650,6 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndActivation(MKLDNNGraph &graph) {
 }
 
 void MKLDNNGraphOptimizer::FuseFullyConnectedAndSimpleOperation(MKLDNNGraph &graph) {
-    auto isOneOf = [&](mkldnn::algorithm alg, std::vector<mkldnn::algorithm> algs) {
-        for (auto a : algs) {
-            if (alg == a) {
-                return true;
-            }
-        }
-        return false;
-    };
-
-    auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) {
-        auto& edges = graph.GetEdges();
-        for (auto it = edges.begin(); it != edges.end(); it++) {
-            if ((*it) == edge) {
-                edges.erase(it);
-                return;
-            }
-        }
-    };
-
     auto& graphNodes = graph.GetNodes();
 
     auto isSutableParentNode = [](MKLDNNNodePtr node) {
@@ -808,55 +673,52 @@ void MKLDNNGraphOptimizer::FuseFullyConnectedAndSimpleOperation(MKLDNNGraph &gra
                         quantizeNode->isOutputLowBroadcast() && quantizeNode->isOutputHighBroadcast() &&
                         !quantizeNode->isBinarization());
             }
-        } else if (childNode->getType() == Depthwise) {
-            auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode*>(childNode.get());
-            if (depthwiseNode == nullptr)
-                THROW_IE_EXCEPTION << "Cannot get depthwise layer " << childNode->getName();
+        } else if (childNode->getType() == Eltwise) {
+            auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode*>(childNode.get());
+            if (eltwiseNode == nullptr)
+                THROW_IE_EXCEPTION << "Cannot get Eltwise node " << childNode->getName();
 
-            if (parentNode->getParentEdgesAtPort(0)[0]->getDims().ndims() != 3) {
-                return ((depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_scale_shift &&
-                         depthwiseNode->isWithBiases()) ||
-                        (depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_prelu));
-            } else {
-                const auto &depthwiseLayer = depthwiseNode->getCnnLayer();
-                if (depthwiseLayer == nullptr)
-                    THROW_IE_EXCEPTION << "Cannot get scale shift layer " << depthwiseNode->getName();
-
-                if (depthwiseNode->getAlgorithm() != mkldnn::algorithm::depthwise_scale_shift)
-                    return false;
-
-                Blob::Ptr scalesBlob = depthwiseLayer->blobs["weights"];
-                if (scalesBlob == nullptr)
+            if (IsOneOf(eltwiseNode->getOpType(), {Relu, Gelu, Elu, Logistic, BoundedRelu, Clamp, Swish, Hswish, Mish, Hsigmoid})) {
+                return true;
+            } else if (IsOneOf(eltwiseNode->getOpType(), {MulAdd, Prelu})) {
+                if (eltwiseNode->getOpType() == MulAdd && eltwiseNode->getCnnLayer()->blobs.size() != 2)
                     return false;
 
-                Blob::Ptr shiftsBlob = depthwiseLayer->blobs["biases"];
-                if (shiftsBlob == nullptr)
-                    return false;
+                if (parentNode->getParentEdgesAtPort(0)[0]->getDims().ndims() != 3) {
+                    return true;
+                } else {
+                    const auto &eltwiseLayer = eltwiseNode->getCnnLayer();
+                    if (eltwiseLayer == nullptr)
+                        THROW_IE_EXCEPTION << "Cannot get scale shift layer " << eltwiseNode->getName();
 
-                const float* scalesBufferPtr = scalesBlob->buffer().as<float*>();
-                const float* shiftsBufferPtr = shiftsBlob->buffer().as<float*>();
+                    if (eltwiseNode->getOpType() != MulAdd)
+                        return false;
 
-                if (scalesBlob->size() != shiftsBlob->size())
-                    return false;
+                    Blob::Ptr scalesBlob = eltwiseLayer->blobs["weights"];
+                    if (scalesBlob == nullptr)
+                        return false;
 
-                for (int i = 1; i < scalesBlob->size(); i++)
-                    if (scalesBufferPtr[0] != scalesBufferPtr[i])
+                    Blob::Ptr shiftsBlob = eltwiseLayer->blobs["biases"];
+                    if (shiftsBlob == nullptr)
                         return false;
 
-                for (int i = 1; i < shiftsBlob->size(); i++)
-                    if (shiftsBufferPtr[0] != shiftsBufferPtr[i])
+                    const float *scalesBufferPtr = scalesBlob->buffer().as<float *>();
+                    const float *shiftsBufferPtr = shiftsBlob->buffer().as<float *>();
+
+                    if (scalesBlob->size() != shiftsBlob->size())
                         return false;
 
-                return true;
-            }
-        } else if (childNode->getType() == Activation) {
-            auto* activationNode = dynamic_cast<MKLDNNActivationNode*>(childNode.get());
-            if (activationNode == nullptr)
-                THROW_IE_EXCEPTION << "Cannot get activation layer " << childNode->getName();
+                    for (int i = 1; i < scalesBlob->size(); i++)
+                        if (scalesBufferPtr[0] != scalesBufferPtr[i])
+                            return false;
 
-            return isOneOf(activationNode->getAlgorithm(), {eltwise_relu, eltwise_gelu, eltwise_elu, eltwise_logistic,
-                                                            eltwise_bounded_relu, eltwise_clamp, eltwise_swish, eltwise_hswish,
-                                                            eltwise_mish, eltwise_hsigmoid});
+                    for (int i = 1; i < shiftsBlob->size(); i++)
+                        if (shiftsBufferPtr[0] != shiftsBufferPtr[i])
+                            return false;
+
+                    return true;
+                }
+            }
         }
 
         return false;
@@ -878,7 +740,7 @@ void MKLDNNGraphOptimizer::FuseFullyConnectedAndSimpleOperation(MKLDNNGraph &gra
 
         parentNode->fuseWith(childNode);
 
-        if (childNode->getType() == Quantize) {
+        if (childNode->getType() == Quantize || childNode->getType() == Eltwise) {
             auto parentEdges = childNode->parentEdges;
             for (auto &parentEdge : parentEdges) {
                 auto p_edge = parentEdge.lock();
@@ -892,9 +754,7 @@ void MKLDNNGraphOptimizer::FuseFullyConnectedAndSimpleOperation(MKLDNNGraph &gra
         graph.DropNode(childNode);
     }
 }
-#endif
 
-#if defined (COMPILED_CPU_MKLDNN_DEPTHWISE_NODE)
 void MKLDNNGraphOptimizer::FuseConvolutionAndDepthwise(MKLDNNGraph &graph) {
     auto& graphNodes = graph.GetNodes();
 
@@ -906,17 +766,17 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDepthwise(MKLDNNGraph &graph) {
     };
 
     auto isSutableChildNode = [](MKLDNNNodePtr node) {
-        if (node->getType() != Depthwise)
+        if (node->getType() != Eltwise)
             return false;
 
         if (!node->getCnnLayer())
             return false;
 
-        auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode *>(node.get());
-        if (depthwiseNode == nullptr)
-            THROW_IE_EXCEPTION << "Cannot get depthwise node " << node->getName();
-        return ((depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_scale_shift && depthwiseNode->isWithBiases()) ||
-                (depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_prelu));
+        auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
+        if (eltwiseNode == nullptr)
+            THROW_IE_EXCEPTION << "Cannot get eltwise node " << node->getName();
+        return ((eltwiseNode->getOpType() == MulAdd && node->getCnnLayer()->blobs.size() == 2) ||
+                (eltwiseNode->getOpType() == Prelu));
     };
 
     for (int i = 0; i < graphNodes.size(); i++) {
@@ -933,14 +793,32 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDepthwise(MKLDNNGraph &graph) {
 
             if (isSutableChildNode(depthwise1)) {
                 conv->fuseWith(depthwise1);
+
+                auto parents = depthwise1->parentEdges;
+                for (size_t j = 0; j < parents.size(); j++) {
+                    auto p_edge = parents[j].lock();
+                    if (p_edge->getParent()->getType() == Eltwise)
+                        continue;
+
+                    removeEdge(graph, p_edge);
+                }
+
                 graph.DropNode(depthwise1);
             }
         }
 
+        auto parents = depthwise0->parentEdges;
+        for (size_t j = 0; j < parents.size(); j++) {
+            auto p_edge = parents[j].lock();
+            if (p_edge->getParent()->getType() == Convolution || p_edge->getParent()->getType() == BinaryConvolution)
+                continue;
+
+            removeEdge(graph, p_edge);
+        }
+
         graph.DropNode(depthwise0);
     }
 }
-#endif
 
 void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) {
     auto& graphNodes = graph.GetNodes();
@@ -1088,16 +966,6 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) {
 
 #if defined(COMPILED_CPU_MKLDNN_QUANTIZE_NODE)
 void MKLDNNGraphOptimizer::FuseConvolutionAndQuantize(MKLDNNGraph &graph) {
-    auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) {
-        auto& edges = graph.GetEdges();
-        for (auto it = edges.begin(); it != edges.end(); it++) {
-            if ((*it) == edge) {
-                edges.erase(it);
-                return;
-            }
-        }
-    };
-
     auto& graphNodes = graph.GetNodes();
 
     auto isSutableParentNode = [](MKLDNNNodePtr node) {
@@ -1151,25 +1019,6 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndQuantize(MKLDNNGraph &graph) {
 }
 
 void MKLDNNGraphOptimizer::FuseConvolutionAndSimpleOperation(MKLDNNGraph &graph) {
-    auto isOneOf = [&](mkldnn::algorithm alg, std::vector<mkldnn::algorithm> algs) {
-        for (auto a : algs) {
-            if (alg == a) {
-                return true;
-            }
-        }
-        return false;
-    };
-
-    auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) {
-        auto& edges = graph.GetEdges();
-        for (auto it = edges.begin(); it != edges.end(); it++) {
-            if ((*it) == edge) {
-                edges.erase(it);
-                return;
-            }
-        }
-    };
-
     auto& graphNodes = graph.GetNodes();
 
     auto isSutableParentNode = [](MKLDNNNodePtr node) {
@@ -1188,21 +1037,14 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndSimpleOperation(MKLDNNGraph &graph)
                 THROW_IE_EXCEPTION << "Cannot get quantize layer " << node->getName();
 
             return !quantizeNode->isBinarization();
-        } else if (node->getType() == Depthwise) {
-            auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode*>(node.get());
-            if (depthwiseNode == nullptr)
-                THROW_IE_EXCEPTION << "Cannot get depthwise layer " << node->getName();
-
-            return ((depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_scale_shift && depthwiseNode->isWithBiases()) ||
-                    (depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_prelu));
-        } else if (node->getType() == Activation) {
-            auto* activationNode = dynamic_cast<MKLDNNActivationNode*>(node.get());
-            if (activationNode == nullptr)
-                THROW_IE_EXCEPTION << "Cannot get activation layer " << node->getName();
+        } else if (node->getType() == Eltwise) {
+            auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
+            if (eltwiseNode == nullptr)
+                THROW_IE_EXCEPTION << "Cannot get eltwise node " << node->getName();
 
-            return isOneOf(activationNode->getAlgorithm(), {eltwise_relu, eltwise_elu, eltwise_logistic, eltwise_bounded_relu,
-                                                            eltwise_clamp, eltwise_swish, eltwise_hswish, eltwise_mish,
-                                                            eltwise_hsigmoid});
+            return ((eltwiseNode->getOpType() == MulAdd && node->getCnnLayer()->blobs.size() == 2) ||
+                    (eltwiseNode->getOpType() == Prelu) ||
+                    IsOneOf(eltwiseNode->getOpType(), {Relu, Elu, Logistic, BoundedRelu, Clamp, Swish, Hswish, Mish, Hsigmoid}));
         }
 
         return false;
@@ -1224,7 +1066,7 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndSimpleOperation(MKLDNNGraph &graph)
 
         parentNode->fuseWith(childNode);
 
-        if (childNode->getType() == Quantize) {
+        if (childNode->getType() == Quantize || childNode->getType() == Eltwise) {
             auto parentEdges = childNode->parentEdges;
             for (auto &parentEdge : parentEdges) {
                 auto p_edge = parentEdge.lock();
@@ -1240,16 +1082,6 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndSimpleOperation(MKLDNNGraph &graph)
 }
 
 void MKLDNNGraphOptimizer::FuseBinaryConvolutionAndQuantize(MKLDNNGraph &graph) {
-    auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) {
-        auto& edges = graph.GetEdges();
-        for (auto it = edges.begin(); it != edges.end(); it++) {
-            if ((*it) == edge) {
-                edges.erase(it);
-                return;
-            }
-        }
-    };
-
     auto& graphNodes = graph.GetNodes();
 
     auto isSutableParentNode = [](MKLDNNNodePtr node) {
@@ -1294,16 +1126,6 @@ void MKLDNNGraphOptimizer::FuseBinaryConvolutionAndQuantize(MKLDNNGraph &graph)
 }
 
 void MKLDNNGraphOptimizer::FusePoolingAndQuantize(MKLDNNGraph &graph) {
-    auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) {
-        auto& edges = graph.GetEdges();
-        for (auto it = edges.begin(); it != edges.end(); it++) {
-            if ((*it) == edge) {
-                edges.erase(it);
-                return;
-            }
-        }
-    };
-
     auto& graphNodes = graph.GetNodes();
 
     auto isSutableParentNode = [](MKLDNNNodePtr node) {
@@ -1427,30 +1249,16 @@ static bool is_data_dependency(const std::shared_ptr<MKLDNNNode> &parent,
 void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNGraph &graph) {
     std::vector<MKLDNNNodePtr> &graphNodes = graph.GetNodes();
 
-    auto isOneOf = [&](mkldnn::algorithm alg, std::vector<mkldnn::algorithm> algs) {
-        for (auto a : algs) {
-            if (alg == a) {
-                return true;
-            }
-        }
-        return false;
-    };
-
     auto isFusingSupported = [&](MKLDNNNodePtr conv, MKLDNNNodePtr activation) {
         if (!activation->getCnnLayer())
             return false;
 
-#if defined(COMPILED_CPU_MKLDNN_ACTIVATION_NODE)
-        auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(activation.get());
+        auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(activation.get());
 
-        return activationNode &&
-            (activationNode->getAlgorithm() == eltwise_relu ||
+        return eltwiseNode &&
+            (eltwiseNode->getOpType() == Relu ||
             (conv->getCnnLayer()->precision == Precision::FP32 &&
-             isOneOf(activationNode->getAlgorithm(), {eltwise_elu, eltwise_logistic, eltwise_bounded_relu, eltwise_clamp,
-                                                      eltwise_swish, eltwise_hswish, eltwise_mish, eltwise_hsigmoid})));
-#else
-        return false;
-#endif
+             IsOneOf(eltwiseNode->getOpType(), {Elu, Logistic, BoundedRelu, Clamp, Swish, Hswish, Mish, Hsigmoid})));
     };
 
     for (auto &graphNode : graphNodes) {
@@ -1458,7 +1266,6 @@ void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNG
             continue;
 
         if (!std::dynamic_pointer_cast<MKLDNNEltwiseNode>(graphNode)->isSum()) continue;
-        if (!std::dynamic_pointer_cast<MKLDNNEltwiseNode>(graphNode)->isUnitScales()) continue;
         if (std::dynamic_pointer_cast<MKLDNNEltwiseNode>(graphNode)->isWithBroadcast()) continue;
 
         // TODO: Enlarge to several inputs
@@ -1582,16 +1389,6 @@ void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNG
 #endif
 
 void MKLDNNGraphOptimizer::FuseMVNAndSimpleOperation(MKLDNNGraph &graph) {
-    auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) {
-        auto& edges = graph.GetEdges();
-        for (auto it = edges.begin(); it != edges.end(); it++) {
-            if ((*it) == edge) {
-                edges.erase(it);
-                return;
-            }
-        }
-    };
-
     auto& graphNodes = graph.GetNodes();
 
     auto isSutableParentNode = [](MKLDNNNodePtr node) {
@@ -1617,16 +1414,14 @@ void MKLDNNGraphOptimizer::FuseMVNAndSimpleOperation(MKLDNNGraph &graph) {
             if (quantizeNode == nullptr)
                 THROW_IE_EXCEPTION << "Cannot get quantize layer " << node->getName();
             return !quantizeNode->isBinarization();
-        } else if (node->getType() == Depthwise) {
-            auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode*>(node.get());
-            if (depthwiseNode == nullptr)
-                THROW_IE_EXCEPTION << "Cannot get depthwise layer " << node->getName();
-            return depthwiseNode->cnnLayer->type == "ScaleShift";
-        } else if (node->getType() == Activation) {
-            auto* activationNode = dynamic_cast<MKLDNNActivationNode*>(node.get());
-            if (activationNode == nullptr)
-                THROW_IE_EXCEPTION << "Cannot get activation layer " << node->getName();
-            return activationNode->getAlgorithm() == eltwise_relu;
+        } else if (node->getType() == Eltwise) {
+            auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
+            if (eltwiseNode == nullptr)
+                THROW_IE_EXCEPTION << "Cannot get eltwise node " << node->getName();
+
+            return ((eltwiseNode->getOpType() == MulAdd) ||
+                    (eltwiseNode->getOpType() == Prelu) ||
+                     eltwiseNode->getOpType() == Relu);
         }
 
         return false;
@@ -1648,7 +1443,7 @@ void MKLDNNGraphOptimizer::FuseMVNAndSimpleOperation(MKLDNNGraph &graph) {
 
         parentNode->fuseWith(childNode);
 
-        if (childNode->getType() == Quantize) {
+        if (childNode->getType() == Quantize || childNode->getType() == Eltwise) {
             auto parentEdges = childNode->parentEdges;
             for (auto &parentEdge : parentEdges) {
                 auto p_edge = parentEdge.lock();
@@ -1664,16 +1459,6 @@ void MKLDNNGraphOptimizer::FuseMVNAndSimpleOperation(MKLDNNGraph &graph) {
 }
 
 void MKLDNNGraphOptimizer::FuseResampleAndSimpleOperation(MKLDNNGraph &graph) {
-    auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) {
-        auto& edges = graph.GetEdges();
-        for (auto it = edges.begin(); it != edges.end(); it++) {
-            if ((*it) == edge) {
-                edges.erase(it);
-                return;
-            }
-        }
-    };
-
     auto& graphNodes = graph.GetNodes();
 
     auto isSutableParentNode = [](MKLDNNNodePtr node) {
@@ -1699,16 +1484,12 @@ void MKLDNNGraphOptimizer::FuseResampleAndSimpleOperation(MKLDNNGraph &graph) {
             if (quantizeNode == nullptr)
                 THROW_IE_EXCEPTION << "Cannot get quantize layer " << node->getName();
             return !quantizeNode->isBinarization();
-        } else if (node->getType() == Depthwise) {
-            auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode*>(node.get());
-            if (depthwiseNode == nullptr)
-                THROW_IE_EXCEPTION << "Cannot get depthwise layer " << node->getName();
-            return depthwiseNode->cnnLayer->type == "ScaleShift";
-        } else if (node->getType() == Activation) {
-            auto* activationNode = dynamic_cast<MKLDNNActivationNode*>(node.get());
-            if (activationNode == nullptr)
-                THROW_IE_EXCEPTION << "Cannot get activation layer " << node->getName();
-            return activationNode->getAlgorithm() == eltwise_relu;
+        } else if (node->getType() == Eltwise) {
+            auto *eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
+            if (eltwiseNode == nullptr)
+                THROW_IE_EXCEPTION << "Cannot get Eltwise node " << node->getName();
+            return eltwiseNode->getOpType() == Relu ||
+                   eltwiseNode->getOpType() == MulAdd;
         }
 
         return false;
@@ -1730,7 +1511,7 @@ void MKLDNNGraphOptimizer::FuseResampleAndSimpleOperation(MKLDNNGraph &graph) {
 
         parentNode->fuseWith(childNode);
 
-        if (childNode->getType() == Quantize) {
+        if (childNode->getType() == Quantize || childNode->getType() == Eltwise) {
             auto parentEdges = childNode->parentEdges;
             for (auto &parentEdge : parentEdges) {
                 auto p_edge = parentEdge.lock();
@@ -1746,16 +1527,6 @@ void MKLDNNGraphOptimizer::FuseResampleAndSimpleOperation(MKLDNNGraph &graph) {
 }
 
 void MKLDNNGraphOptimizer::FuseInterpolateAndSimpleOperation(MKLDNNGraph &graph) {
-    auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) {
-        auto& edges = graph.GetEdges();
-        for (auto it = edges.begin(); it != edges.end(); it++) {
-            if ((*it) == edge) {
-                edges.erase(it);
-                return;
-            }
-        }
-    };
-
     auto& graphNodes = graph.GetNodes();
 
     auto isSuitableParentNode = [](MKLDNNNodePtr node) {
@@ -1798,7 +1569,7 @@ void MKLDNNGraphOptimizer::FuseInterpolateAndSimpleOperation(MKLDNNGraph &graph)
 
         parentNode->fuseWith(childNode);
 
-        if (childNode->getType() == Quantize || childNode->getType() == Depthwise || childNode->getType() == Activation) {
+        if (childNode->getType() == Quantize || childNode->getType() == Eltwise) {
             auto parentEdges = childNode->parentEdges;
             for (auto &parentEdge : parentEdges) {
                 auto p_edge = parentEdge.lock();
@@ -1814,25 +1585,6 @@ void MKLDNNGraphOptimizer::FuseInterpolateAndSimpleOperation(MKLDNNGraph &graph)
 }
 
 void MKLDNNGraphOptimizer::FuseNormalizeAndSimpleOperation(MKLDNNGraph &graph) {
-    auto isOneOf = [&](mkldnn::algorithm alg, std::vector<mkldnn::algorithm> algs) {
-        for (auto a : algs) {
-            if (alg == a) {
-                return true;
-            }
-        }
-        return false;
-    };
-
-    auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) {
-        auto& edges = graph.GetEdges();
-        for (auto it = edges.begin(); it != edges.end(); it++) {
-            if ((*it) == edge) {
-                edges.erase(it);
-                return;
-            }
-        }
-    };
-
     auto& graphNodes = graph.GetNodes();
 
     auto isSutableParentNode = [](MKLDNNNodePtr node) {
@@ -1854,20 +1606,16 @@ void MKLDNNGraphOptimizer::FuseNormalizeAndSimpleOperation(MKLDNNGraph &graph) {
             if (quantizeNode == nullptr)
                 THROW_IE_EXCEPTION << "Cannot get quantize layer " << node->getName();
             return !quantizeNode->isBinarization();
-        } else if (node->getType() == Depthwise) {
-            auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode*>(node.get());
-            if (depthwiseNode == nullptr)
-                THROW_IE_EXCEPTION << "Cannot get depthwise layer " << node->getName();
-            return ((depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_scale_shift && depthwiseNode->isWithBiases()) ||
-                    (depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_prelu));
-        } else if (node->getType() == Activation) {
-            auto* activationNode = dynamic_cast<MKLDNNActivationNode*>(node.get());
-            if (activationNode == nullptr)
-                THROW_IE_EXCEPTION << "Cannot get activation layer " << node->getName();
-            return isOneOf(activationNode->getAlgorithm(), {eltwise_relu, eltwise_gelu, eltwise_elu, eltwise_logistic,
-                eltwise_bounded_relu, eltwise_clamp, eltwise_tanh, eltwise_swish, eltwise_hswish, eltwise_mish,
-                eltwise_hsigmoid, eltwise_linear, eltwise_abs, eltwise_square, eltwise_sqrt});
+        } else if (node->getType() == Eltwise) {
+            auto *eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
+            if (eltwiseNode == nullptr)
+                THROW_IE_EXCEPTION << "Cannot get Eltwise node " << node->getName();
+            return IsOneOf(eltwiseNode->getOpType(), {Relu, Gelu, Elu, Logistic, BoundedRelu, Clamp, Tanh, Swish,
+                                                      Hswish, Mish, Hsigmoid, Linear, Abs, Square, Sqrt}) ||
+                    ((eltwiseNode->getOpType() == MulAdd && eltwiseNode->getCnnLayer()->blobs.size() == 2) ||
+                     (eltwiseNode->getOpType() == Prelu));
         }
+
         return false;
     };
 
@@ -1887,7 +1635,7 @@ void MKLDNNGraphOptimizer::FuseNormalizeAndSimpleOperation(MKLDNNGraph &graph) {
 
         parentNode->fuseWith(childNode);
 
-        if (childNode->getType() == Quantize) {
+        if (childNode->getType() == Quantize || childNode->getType() == Eltwise) {
             auto parentEdges = childNode->parentEdges;
             for (auto &parentEdge : parentEdges) {
                 auto p_edge = parentEdge.lock();
@@ -1903,85 +1651,31 @@ void MKLDNNGraphOptimizer::FuseNormalizeAndSimpleOperation(MKLDNNGraph &graph) {
 }
 
 void MKLDNNGraphOptimizer::FuseEltwiseAndSimple(MKLDNNGraph &graph) {
-    auto isOneOf = [&](mkldnn::algorithm alg, std::vector<mkldnn::algorithm> algs) {
-        for (auto a : algs) {
-            if (alg == a) {
-                return true;
-            }
-        }
-        return false;
-    };
-
-    auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) {
-        auto& edges = graph.GetEdges();
-        for (auto it = edges.begin(); it != edges.end(); it++) {
-            if ((*it) == edge) {
-                edges.erase(it);
-                return;
-            }
-        }
-    };
-
     auto& graphNodes = graph.GetNodes();
 
     auto isSutableParentNode = [](MKLDNNNodePtr node) {
-        bool isSutableEltwise = node->getType() == Eltwise;
+        return node->getType() == Eltwise && node->getChildEdges().size() == 1;
+    };
 
-        if (isSutableEltwise) {
-            auto *eltwiseLayer = dynamic_cast<EltwiseLayer *>(node->getCnnLayer().get());
-            if (eltwiseLayer == nullptr)
-                THROW_IE_EXCEPTION << "Cannot get Eltwise layer " << node->getName();
+    auto isSutableChildNode = [&](MKLDNNNodePtr parentNode, MKLDNNNodePtr childNode) {
+        for (auto &childParentEdge : childNode->getParentEdges()) {
+            // WA to prevent unsupported reorder exception issue in some cases
+            if (childParentEdge.lock()->getParent()->getType() == Split) {
+                return false;
+            }
 
-            ptrdiff_t maxChannels = 1;
-            for (size_t i = 0; i < node->getParentEdges().size(); i++) {
-                if (node->getParentEdgeAt(0)->getDims().ndims() != node->getParentEdgeAt(i)->getDims().ndims())
-                    return false;
-                if (node->getParentEdgeAt(i)->getDims().ndims() != 2 &&
-                    node->getParentEdgeAt(i)->getDims().ndims() != 4 &&
-                    node->getParentEdgeAt(i)->getDims().ndims() != 5)
+            // Avoid cycle dependencies
+            for (auto &parentParentEdge : parentNode->getParentEdges()) {
+                if (childParentEdge.lock()->getParent() == parentParentEdge.lock()->getParent())
                     return false;
-                if (maxChannels < node->getParentEdgeAt(i)->getDims()[1])
-                    maxChannels = node->getParentEdgeAt(i)->getDims()[1];
             }
-
-            int simdWidth = mkldnn::impl::cpu::mayiuse(impl::cpu::cpu_isa_t::avx512_common) ? 16 :
-                            mkldnn::impl::cpu::mayiuse(impl::cpu::cpu_isa_t::avx2) ? 8 : 4;
-            if (maxChannels < simdWidth)
-                return false;
-
-            return node->getChildEdges().size() == 1 &&
-                   (eltwiseLayer->_operation == EltwiseLayer::Sum || eltwiseLayer->_operation == EltwiseLayer::Prod) &&
-                   !node->isFusedWith(Quantize);
-        } else {
-            return false;
         }
-    };
 
-    auto isSutableChildNode = [&](MKLDNNNodePtr node) {
-        if (!node->getCnnLayer())
+        if (!childNode->getFusedWith().empty())
             return false;
 
-        if (node->getType() == Quantize) {
-            auto* quantizeNode = dynamic_cast<MKLDNNQuantizeNode*>(node.get());
-            if (quantizeNode == nullptr)
-                THROW_IE_EXCEPTION << "Cannot get quantize layer " << node->getName();
-            return !quantizeNode->isBinarization();
-        } else if (node->getType() == Activation) {
-            // Applicability was narrowed down in order not to affect FP32 topologies
-            if (node->getChildEdges().size() != 1)
-                return false;
-            if (node->getChildEdgeAt(0)->getChild()->getType() != Quantize)
-                return false;
-
-            auto *activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get());
-            if (activationNode == nullptr)
-                THROW_IE_EXCEPTION << "Cannot get activation layer " << node->getName();
-            return isOneOf(activationNode->getAlgorithm(), {eltwise_relu, eltwise_elu, eltwise_logistic, eltwise_bounded_relu,
-                                                            eltwise_clamp, eltwise_swish, eltwise_hswish, eltwise_mish,
-                                                            eltwise_hsigmoid});
-        }
-
-        return false;
+        auto eltwiseNode = dynamic_cast<MKLDNNEltwiseNode*>(parentNode.get());
+        return eltwiseNode->canFuse(childNode);
     };
 
     auto parent = graphNodes.begin();
@@ -1993,7 +1687,7 @@ void MKLDNNGraphOptimizer::FuseEltwiseAndSimple(MKLDNNGraph &graph) {
         }
 
         auto childNode = parentNode->getChildEdgeAt(0)->getChild();
-        if (!isSutableChildNode(childNode)) {
+        if (!isSutableChildNode(parentNode, childNode)) {
             parent++;
             continue;
         }
@@ -2009,9 +1703,70 @@ void MKLDNNGraphOptimizer::FuseEltwiseAndSimple(MKLDNNGraph &graph) {
 
                 removeEdge(graph, p_edge);
             }
-        }
 
-        graph.DropNode(childNode);
+            graph.DropNode(childNode);
+        } else if (childNode->getType() == Eltwise) {
+            auto childs = childNode->childEdges;
+            auto parents = childNode->parentEdges;
+
+            for (size_t i = 0; i < parents.size(); i++) {
+                auto p_edge = parents[i].lock();
+                if (!p_edge) continue;
+                auto parent = p_edge->getParent();
+                if (!parent) continue;
+
+                if (parent == parentNode) {
+                    for (size_t j = 0; j < childs.size(); j++) {
+                        if (!childs[j].lock())
+                            continue;
+                        auto child = childs[j].lock()->getChild();
+                        if (!child)
+                            continue;
+
+                        MKLDNNEdgePtr &remEdge = p_edge;
+                        int inNum = 0;
+                        if (remEdge) {
+                            inNum = remEdge->getInputNum();
+                            remEdge->drop();
+                            removeEdge(graph, remEdge);
+                        }
+                        remEdge = childs[j].lock();
+                        int outNum = 0;
+                        if (remEdge) {
+                            outNum = remEdge->getOutputNum();
+                            remEdge->drop();
+                            removeEdge(graph, remEdge);
+                        }
+                        MKLDNNEdgePtr newEdge(new MKLDNNEdge(parent, child, inNum, outNum));
+                        auto &graphEdges = graph.GetEdges();
+                        graphEdges.push_back(newEdge);
+                        parent->addEdge(newEdge);
+
+                        parent->outDims[inNum] = child->inDims[outNum];
+                    }
+                } else {
+                    MKLDNNEdgePtr &remEdge = p_edge;
+                    int inNum = 0;
+                    if (remEdge) {
+                        inNum = remEdge->getInputNum();
+                        remEdge->drop();
+                        removeEdge(graph, remEdge);
+                    }
+
+                    auto parentEltwise = parentNode;
+                    MKLDNNEdgePtr newEdge(new MKLDNNEdge(parent, parentEltwise, inNum, parentEltwise->getParentEdges().size()));
+                    auto &graphEdges = graph.GetEdges();
+                    graphEdges.push_back(newEdge);
+                    parent->addEdge(newEdge);
+
+                    parentEltwise->inDims.push_back(parent->outDims[0]);
+                }
+            }
+
+            graph.DropNode(childNode);
+        } else {
+            graph.DropNode(childNode);
+        }
     }
 }
 
@@ -2019,15 +1774,18 @@ void MKLDNNGraphOptimizer::RemoveIdentityOperator(MKLDNNGraph &graph) {
     for (MKLDNNNodePtr& node : graph.GetNodes()) {
         bool toDrop = false;
 
-        if (node->getType() == Power) {
-            PowerLayer* l = dynamic_cast<PowerLayer*>(node->getCnnLayer().get());
-            if (l == nullptr)
-                THROW_IE_EXCEPTION << "Cannot get power layer " << node->getName();
+        if (node->getType() == Eltwise) {
+            auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode*>(node.get());
+            if (eltwiseNode->getOpType() == PowerStatic) {
+                PowerLayer *l = dynamic_cast<PowerLayer *>(node->getCnnLayer().get());
+                if (l == nullptr)
+                    THROW_IE_EXCEPTION << "Cannot get power layer " << node->getName();
 
-            if (l->power == 1.0f && l->scale == 1.0f && l->offset == 0.0f) toDrop = true;
+                if (l->power == 1.0f && l->scale == 1.0f && l->offset == 0.0f) toDrop = true;
+            }
         }
 
-        if (node->getType() == Depthwise && node->getCnnLayer()->type == "ScaleShift") {
+        if (node->getType() == Eltwise && node->getCnnLayer()->type == "ScaleShift") {
             ScaleShiftLayer* l = dynamic_cast<ScaleShiftLayer*>(node->getCnnLayer().get());
             if (l == nullptr)
                 THROW_IE_EXCEPTION << "Cannot get scale shift layer " << node->getName();
@@ -2177,7 +1935,7 @@ void MKLDNNGraphOptimizer::DropConvertReorder(MKLDNNGraph& graph) {
 
 void MKLDNNGraphOptimizer::RemoveIOScaleShifts(MKLDNNGraph &graph) {
     for (MKLDNNNodePtr& node : graph.GetNodes()) {
-        if (node->getType() == Depthwise && node->getCnnLayer()->type == "ScaleShift") {
+        if (node->getType() == Eltwise && node->getCnnLayer()->type == "ScaleShift") {
             ScaleShiftLayer* l = dynamic_cast<ScaleShiftLayer*>(node->getCnnLayer().get());
             if (l == nullptr)
                 THROW_IE_EXCEPTION << "Cannot get scale shift layer " << node->getName();
@@ -2235,6 +1993,25 @@ bool MKLDNNGraphOptimizer::IsOneOf(Type type, std::vector<Type> types) {
     return false;
 }
 
+// Reports whether `alg` is equal to at least one of the operation types listed in `algs`.
+bool MKLDNNGraphOptimizer::IsOneOf(EltwiseOpType alg, std::vector<EltwiseOpType> algs) {
+    bool matched = false;
+    // The scan ends as soon as the first equal entry is seen.
+    for (size_t idx = 0; idx < algs.size() && !matched; idx++)
+        matched = (alg == algs[idx]);
+    return matched;
+}
+
+// Erases the first entry equal to `edge` from graph.GetEdges(); does nothing when no entry matches.
+void MKLDNNGraphOptimizer::removeEdge(MKLDNNGraph &graph, MKLDNNEdgePtr& edge) {
+    auto& graphEdges = graph.GetEdges();
+    auto pos = graphEdges.begin();
+    while (pos != graphEdges.end() && !((*pos) == edge))
+        pos++;
+    if (pos != graphEdges.end())
+        graphEdges.erase(pos);
+}
+
 void MKLDNNGraphOptimizer::FuseBroadcastAndEltwise(MKLDNNGraph &graph) {
     std::vector<MKLDNNNodePtr>& graphNodes = graph.GetNodes();
 
@@ -2269,17 +2046,17 @@ void MKLDNNGraphOptimizer::FuseClampAndQuantize(MKLDNNGraph &graph) {
     auto& graphNodes = graph.GetNodes();
 
     auto isSutableClampNode = [](MKLDNNNodePtr node) {
-        if (node->getType() != Activation)
+        if (node->getType() != Eltwise)
             return false;
 
-        auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get());
-        if (activationNode == nullptr)
-            THROW_IE_EXCEPTION << "Cannot cast " << node->getName() << " to Activation node";
+        auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
+        if (eltwiseNode == nullptr)
+            THROW_IE_EXCEPTION << "Cannot cast " << node->getName() << " to Eltwise node";
 
-        if (activationNode->getChildEdges().size() != 1)
+        if (eltwiseNode->getChildEdges().size() != 1)
             return false;
 
-        if (activationNode->getAlgorithm() != eltwise_clamp)
+        if (eltwiseNode->getOpType() != Clamp)
             return false;
 
         return true;
@@ -2297,9 +2074,9 @@ void MKLDNNGraphOptimizer::FuseClampAndQuantize(MKLDNNGraph &graph) {
     };
 
     auto fuseClampAndQuantizeNodes = [](MKLDNNNodePtr parent, MKLDNNNodePtr child) {
-        auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(parent.get());
-        if (activationNode == nullptr)
-            THROW_IE_EXCEPTION << "Cannot cast " << parent->getName() << " to Activation node";
+        auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(parent.get());
+        if (eltwiseNode == nullptr)
+            THROW_IE_EXCEPTION << "Cannot cast " << parent->getName() << " to Eltwise node";
 
         auto* quantizeNode = dynamic_cast<MKLDNNQuantizeNode*>(child.get());
         if (quantizeNode == nullptr)
@@ -2311,9 +2088,9 @@ void MKLDNNGraphOptimizer::FuseClampAndQuantize(MKLDNNGraph &graph) {
         std::vector<float> newCropLow(cropLowData.size());
         std::vector<float> newCropHigh(cropHighData.size());
         for (int i = 0; i < cropLowData.size(); i++)
-            newCropLow[i] = std::max(cropLowData[i], activationNode->getBeta());
+            newCropLow[i] = std::max(cropLowData[i], eltwiseNode->getBeta());
         for (int i = 0; i < cropHighData.size(); i++)
-            newCropHigh[i] = std::min(cropHighData[i], activationNode->getAlpha());
+            newCropHigh[i] = std::min(cropHighData[i], eltwiseNode->getAlpha());
 
         quantizeNode->setCropLow(newCropLow);
         quantizeNode->setCropHigh(newCropHigh);
@@ -2338,17 +2115,17 @@ void MKLDNNGraphOptimizer::FuseScaleShiftAndQuantize(MKLDNNGraph &graph) {
     auto& graphNodes = graph.GetNodes();
 
     auto isSutableScaleShiftNode = [](MKLDNNNodePtr node) {
-        if (node->getType() != Depthwise)
+        if (node->getType() != Eltwise)
             return false;
 
-        auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode *>(node.get());
-        if (depthwiseNode == nullptr)
-            THROW_IE_EXCEPTION << "Cannot cast " << node->getName() << " to Depthwise node";
+        auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
+        if (eltwiseNode == nullptr)
+            THROW_IE_EXCEPTION << "Cannot cast " << node->getName() << " to eltwise node";
 
-        if (depthwiseNode->getChildEdges().size() != 1)
+        if (eltwiseNode->getChildEdges().size() != 1)
             return false;
 
-        if (depthwiseNode->getAlgorithm() != depthwise_scale_shift || depthwiseNode->isBroadcast())
+        if (eltwiseNode->getOpType() != MulAdd)
             return false;
 
         return true;
@@ -2366,23 +2143,23 @@ void MKLDNNGraphOptimizer::FuseScaleShiftAndQuantize(MKLDNNGraph &graph) {
     };
 
     auto fuseScaleShiftAndQuantizeNodes = [](MKLDNNNodePtr parent, MKLDNNNodePtr child) {
-        auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode *>(parent.get());
-        if (depthwiseNode == nullptr)
-            THROW_IE_EXCEPTION << "Cannot cast " << parent->getName() << " to Depthwise node";
+        auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(parent.get());
+        if (eltwiseNode == nullptr)
+            THROW_IE_EXCEPTION << "Cannot cast " << parent->getName() << " to eltwise node";
 
-        auto depthwiseLayer = depthwiseNode->getCnnLayer();
-        if (depthwiseLayer == nullptr)
-            THROW_IE_EXCEPTION << "Cannot get scale shift layer " << depthwiseNode->getName();
+        auto eltwiseLayer = eltwiseNode->getCnnLayer();
+        if (eltwiseLayer == nullptr)
+            THROW_IE_EXCEPTION << "Cannot get scale shift layer " << eltwiseNode->getName();
 
         auto* quantizeNode = dynamic_cast<MKLDNNQuantizeNode*>(child.get());
         if (quantizeNode == nullptr)
             THROW_IE_EXCEPTION << "Cannot cast " << child->getName() << " to Quantize node";
 
-        Blob::Ptr scalesBlob = depthwiseLayer->blobs["weights"];
+        Blob::Ptr scalesBlob = eltwiseLayer->blobs["weights"];
         if (scalesBlob == nullptr)
             return false;
 
-        Blob::Ptr shiftsBlob = depthwiseLayer->blobs["biases"];
+        Blob::Ptr shiftsBlob = eltwiseLayer->blobs["biases"];
         if (shiftsBlob == nullptr)
             return false;
 
@@ -2447,6 +2224,15 @@ void MKLDNNGraphOptimizer::FuseScaleShiftAndQuantize(MKLDNNGraph &graph) {
         if (!isSutableQuantizeNode(child)) continue;
 
         if (fuseScaleShiftAndQuantizeNodes(parent, child)) {
+            auto parentEdges = parent->parentEdges;
+            for (auto &parentEdge : parentEdges) {
+                auto p_edge = parentEdge.lock();
+                if (p_edge->getParent()->getCnnLayer()->type != "Const")
+                    continue;
+
+                removeEdge(graph, p_edge);
+            }
+
             graph.DropNode(parent);
         }
     }
index 2feb0f2..54bdda6 100644 (file)
@@ -5,6 +5,7 @@
 #pragma once
 
 #include "mkldnn_graph.h"
+#include "nodes/mkldnn_eltwise_node.h"
 #include <vector>
 
 namespace MKLDNNPlugin {
@@ -18,18 +19,12 @@ public:
     void ApplyImplSpecificGraphOptimizations(MKLDNNGraph& graph);
 
 private:
-    void SLTMTransform(MKLDNNGraph& graph);
     void MergeConversions(MKLDNNGraph& graph);
     void MergeGroupConvolution(MKLDNNGraph& graph);
     void MergeTwoEqualScaleShifts(MKLDNNGraph& graph);
-    void MergeSigmoidAndMultiplyToSwish(MKLDNNGraph& graph);
-#if defined(COMPILED_CPU_MKLDNN_ACTIVATION_NODE)
     void FuseConvolutionAndActivation(MKLDNNGraph &graph);
     void FuseFullyConnectedAndSimpleOperation(MKLDNNGraph &graph);
-#endif
-#if defined (COMPILED_CPU_MKLDNN_DEPTHWISE_NODE)
     void FuseConvolutionAndDepthwise(MKLDNNGraph &graph);
-#endif
     void FuseConvolutionAndSimpleOperation(MKLDNNGraph &graph);
     void FuseConvolutionAndDWConvolution(MKLDNNGraph &graph);
 #if defined(COMPILED_CPU_MKLDNN_QUANTIZE_NODE)
@@ -59,6 +54,9 @@ private:
     void FuseClampAndQuantize(MKLDNNGraph &graph);
 
     bool IsOneOf(Type type, std::vector<Type> types);
+    bool IsOneOf(EltwiseOpType alg, std::vector<EltwiseOpType> algs);
+
+    void removeEdge(MKLDNNGraph &graph, MKLDNNEdgePtr& edge);
 };
 
 }  // namespace MKLDNNPlugin
index f459688..ff96a75 100644 (file)
 #include <nodes/mkldnn_input_node.h>
 #include <nodes/mkldnn_lrn_node.h>
 #include <nodes/mkldnn_pooling_node.h>
-#include <nodes/mkldnn_power_node.h>
-#include <nodes/mkldnn_activation_node.h>
 #include <nodes/mkldnn_reorder_node.h>
 #include <nodes/mkldnn_reshape_node.h>
 #include <nodes/mkldnn_roi_pooling_node.h>
-#include <nodes/mkldnn_depthwise_node.h>
 #include <nodes/mkldnn_softmax_node.h>
 #include <nodes/mkldnn_tile_node.h>
 #include <nodes/mkldnn_split_node.h>
@@ -63,23 +60,23 @@ static const InferenceEngine::details::caseless_unordered_map<std::string, Type>
         { "Output", Output },
         { "Reorder", Reorder },
         { "Convolution", Convolution },
-        { "ReLU", Activation },
-        { "GELU", Activation },
-        { "ELU", Activation },
-        { "Sigmoid", Activation },
-        { "Logistic", Activation },
-        { "TanH", Activation },
-        { "ReLU6", Activation },
-        { "Exp", Activation },
-        { "Not", Activation },
-        { "Activation", Activation },
-        { "Clamp", Activation },
-        { "Swish", Activation },
-        { "HSwish", Activation },
-        { "Mish", Activation },
-        { "HSigmoid", Activation },
-        { "ScaleShift", Depthwise },
-        { "PReLU", Depthwise },
+        { "ReLU", Eltwise },
+        { "GELU", Eltwise },
+        { "ELU", Eltwise },
+        { "Sigmoid", Eltwise },
+        { "Logistic", Eltwise },
+        { "TanH", Eltwise },
+        { "ReLU6", Eltwise },
+        { "Exp", Eltwise },
+        { "Not", Eltwise },
+        { "Activation", Eltwise },
+        { "Clamp", Eltwise },
+        { "Swish", Eltwise },
+        { "HSwish", Eltwise },
+        { "Mish", Eltwise },
+        { "HSigmoid", Eltwise },
+        { "ScaleShift", Eltwise },
+        { "PReLU", Eltwise },
         { "Norm", Lrn },
         { "LRN", Lrn },
         { "Pooling", Pooling },
@@ -91,9 +88,10 @@ static const InferenceEngine::details::caseless_unordered_map<std::string, Type>
         { "Split", Split },
         { "Slice", Split },
         { "Concat", Concatenation },
-        { "Power", Power },
         { "Deconvolution", Deconvolution },
         { "Eltwise", Eltwise },
+        { "Mod", Eltwise },
+        { "Power", Eltwise },
         { "Crop", Crop },
         { "Reshape", Reshape },
         { "Tile", Tile },
index f5f6953..469cc7a 100644 (file)
@@ -44,7 +44,6 @@ enum Type {
     SoftMax,
     Split,
     Concatenation,
-    Power,
     Eltwise,
     Gemm,
     Crop,
@@ -118,8 +117,6 @@ static std::string NameFromType(Type type) {
             return "Split";
         case Concatenation:
             return "Concatenation";
-        case Power:
-            return "Power";
         case Depthwise:
             return "Depthwise";
         case Crop:
index 97f97af..7df7a9f 100644 (file)
@@ -43,6 +43,7 @@
 #include <transformations/op_conversions/softplus_decomposition.hpp>
 #include <transformations/op_conversions/convert_space_to_batch.hpp>
 #include <transformations/op_conversions/convert_batch_to_space.hpp>
+#include <transformations/op_conversions/convert_mod.hpp>
 #include <transformations/convert_precision.hpp>
 #include <transformations/init_node_info.hpp>
 #include <transformations/rt_info/fused_names_attribute.hpp>
@@ -145,6 +146,7 @@ static void Transformation(ICNNNetwork::Ptr& clonedNetwork, const Config& conf)
     pass_config->disable<ngraph::pass::ReduceL2Decomposition>();
     pass_config->disable<ngraph::pass::SoftPlusDecomposition>();
     pass_config->disable<ngraph::pass::HSigmoidDecomposition>();
+    pass_config->disable<ngraph::pass::ConvertMod>();
 
     pass_config->enable<ngraph::pass::ConvertPadToGroupConvolution>();
 
diff --git a/inference-engine/src/mkldnn_plugin/nodes/common/emitter.cpp b/inference-engine/src/mkldnn_plugin/nodes/common/emitter.cpp
new file mode 100644 (file)
index 0000000..8719a48
--- /dev/null
@@ -0,0 +1,200 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "emitter.h"
+#include <vector>
+
+using namespace mkldnn::impl::cpu;
+using namespace mkldnn::impl;
+using namespace Xbyak;
+
+namespace MKLDNNPlugin {
+
+// Base case: true when val compares equal to the single candidate.
+template <typename T, typename P>
+constexpr bool one_of(T val, P item) {
+    return val == item;
+}
+
+// Recursive case: true when val compares equal to any of the listed candidates.
+template <typename T, typename P, typename... Args>
+constexpr bool one_of(T val, P item, Args... item_others) {
+    return one_of(val, item) ? true : one_of(val, item_others...);
+}
+
+
+// Number of vector registers assumed for the host ISA: 32 for the AVX-512 variants, 16 otherwise.
+size_t jit_emitter::get_max_vecs_count() const {
+    const bool is_avx512 = one_of(host_isa_, cpu::avx512_common, cpu::avx512_core);
+    if (is_avx512)
+        return 32;
+    return 16;
+}
+
+// Size in bytes of one vector register for the host ISA: 64 (AVX-512 variants), 32 (avx2) or 16.
+size_t jit_emitter::get_vec_length() const {
+    if (one_of(host_isa_, cpu::avx512_common, cpu::avx512_core))
+        return 64;
+    if (one_of(host_isa_, cpu::avx2))
+        return 32;
+    return 16;
+}
+
+// Emits a store of vector register vec_idx to addr, using the register width of the host ISA
+// (Xmm for sse42, Ymm for avx2, Zmm for anything else).
+void jit_emitter::push_vec(const Xbyak::Address &addr, size_t vec_idx) const {
+    switch (host_isa_) {
+    case cpu::sse42:
+        h->uni_vmovups(addr, Xmm(vec_idx));
+        break;
+    case cpu::avx2:
+        h->uni_vmovups(addr, Ymm(vec_idx));
+        break;
+    default:
+        h->uni_vmovups(addr, Zmm(vec_idx));
+        break;
+    }
+}
+
+// Emits a load of vector register vec_idx from addr, using the register width of the host ISA
+// (Xmm for sse42, Ymm for avx2, Zmm for anything else).
+void jit_emitter::pop_vec(size_t vec_idx, const Xbyak::Address &addr) const {
+    switch (host_isa_) {
+    case cpu::sse42:
+        h->uni_vmovups(Xmm(vec_idx), addr);
+        break;
+    case cpu::avx2:
+        h->uni_vmovups(Ymm(vec_idx), addr);
+        break;
+    default:
+        h->uni_vmovups(Zmm(vec_idx), addr);
+        break;
+    }
+}
+
+// Default: the emitter needs no auxiliary vector registers. Virtual, so emitters may override it.
+size_t jit_emitter::aux_vecs_count() const {
+    const size_t no_aux_vecs = 0;
+    return no_aux_vecs;
+}
+
+// One gpr is required to hold the table address whenever the table has entries; none otherwise.
+size_t jit_emitter::aux_gprs_count() const {
+    if (entry_map_.empty())
+        return 0;
+    return 1;
+}
+
+// Precisions supported by the base emitter: FP32 only.
+std::set<InferenceEngine::Precision> jit_emitter::get_supported_precisions() {
+    std::set<InferenceEngine::Precision> precisions;
+    precisions.insert(InferenceEngine::Precision(InferenceEngine::Precision::FP32));
+    return precisions;
+}
+
+// Chooses the auxiliary vector/gpr registers for one emit() call and emits the code that saves the
+// ones this function picked itself (registers handed in through the pools are not saved here).
+//   in_vec_idxs   - vector registers holding the inputs; never chosen as auxiliaries
+//   pool_vec_idxs - vector registers offered by the caller for use as auxiliaries
+//   pool_gpr_idxs - general purpose registers offered by the caller for use as auxiliaries
+void jit_emitter::emitter_preamble(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &pool_vec_idxs,
+                                   const std::vector<size_t> &pool_gpr_idxs) {
+    using namespace Xbyak::util;
+
+    // Start with whatever the caller offered.
+    for (auto idx : pool_vec_idxs)
+        aux_vec_idxs.push_back(idx);
+
+    // For sse42 mask register has to be Xmm(0)
+    if (host_isa_ == cpu::sse42 && aux_vecs_count() > 0) {
+        size_t idx = 0;
+        // Xmm(0) must not be one of the inputs (debug-only check).
+        assert(std::find(in_vec_idxs.begin(), in_vec_idxs.end(), idx) == in_vec_idxs.end());
+        // If the pool did not contain Xmm(0), take it anyway and remember to save/restore it.
+        if (std::find(aux_vec_idxs.begin(), aux_vec_idxs.end(), idx) == aux_vec_idxs.end()) {
+            aux_vec_idxs.push_back(idx);
+            preserved_vec_idxs.push_back(idx);
+        }
+
+        // moving mask vector at the beginning of aux vectors list to simplify further processing
+        for (int i = 0; i < aux_vec_idxs.size(); i++) {
+            if (aux_vec_idxs[i] == 0) {
+                size_t tmp = aux_vec_idxs[0];
+                aux_vec_idxs[0] = aux_vec_idxs[i];
+                aux_vec_idxs[i] = tmp;
+                break;
+            }
+        }
+    }
+
+    // Top up the aux list, scanning register indices upwards from 0, until aux_vecs_count() registers
+    // are available. Every register picked here is also recorded as preserved.
+    for (size_t idx = 0; idx < get_max_vecs_count(); idx++) {
+        if (aux_vec_idxs.size() >= aux_vecs_count()) break;
+
+        // Skip registers that are inputs, already auxiliary, or already marked preserved.
+        if (std::find(in_vec_idxs.begin(), in_vec_idxs.end(), idx) != in_vec_idxs.end()) continue;
+        if (std::find(aux_vec_idxs.begin(), aux_vec_idxs.end(), idx) != aux_vec_idxs.end()) continue;
+        if (std::find(preserved_vec_idxs.begin(), preserved_vec_idxs.end(), idx) != preserved_vec_idxs.end()) continue;
+
+        aux_vec_idxs.push_back(idx);
+        preserved_vec_idxs.push_back(idx);
+    }
+    assert(aux_vec_idxs.size() >= aux_vecs_count());
+
+    // Same logic but to allocate gprs
+    for (auto idx : pool_gpr_idxs)
+        aux_gpr_idxs.push_back(idx);
+
+    for (size_t gpr_idx = 0; gpr_idx <= Operand::R15; ++gpr_idx) {
+        size_t _idx = Operand::R15 - gpr_idx; // we allocate from the end
+
+        if (aux_gpr_idxs.size() >= aux_gprs_count()) break;
+        if (_idx == Operand::RSP) continue; // the stack pointer is never handed out
+        if (std::find(aux_gpr_idxs.begin(), aux_gpr_idxs.end(), _idx) != aux_gpr_idxs.end()) continue;
+        if (std::find(preserved_gpr_idxs.begin(), preserved_gpr_idxs.end(), _idx) != preserved_gpr_idxs.end()) continue;
+
+        aux_gpr_idxs.push_back(_idx);
+        preserved_gpr_idxs.push_back(_idx);
+    }
+    // NOTE(review): unlike the vector check above this requires an exact match, so a pool offering
+    // more gprs than aux_gprs_count() trips the assert in debug builds - confirm this is intended.
+    assert(aux_gpr_idxs.size() == aux_gprs_count());
+
+    // The first auxiliary gpr becomes the table pointer and is removed from the list of free ones.
+    if (!entry_map_.empty()) {
+        p_table = Reg64(aux_gpr_idxs[0]);
+        aux_gpr_idxs.erase(aux_gpr_idxs.begin());
+    }
+
+    // Save the gprs picked by this function on the stack.
+    for (size_t i = 0; i < preserved_gpr_idxs.size(); ++i)
+        h->push(Reg64(preserved_gpr_idxs[i]));
+
+    // Reserve one vector-sized stack slot per preserved vector register and store them there.
+    if (preserved_vec_idxs.size())
+        h->sub(h->rsp, preserved_vec_idxs.size() * get_vec_length());
+
+    for (size_t i = 0; i < preserved_vec_idxs.size(); ++i) {
+        push_vec(h->ptr[h->rsp + i * get_vec_length()], preserved_vec_idxs[i]);
+    }
+
+    // p_table is only meaningful once the address of l_table has been loaded into it.
+    if (!entry_map_.empty())
+        load_table_addr();
+}
+
+
+// Emits the code that undoes emitter_preamble(): reloads the saved vector registers, releases
+// their stack area, pops the saved gprs, and clears the per-emit register bookkeeping.
+void jit_emitter::emitter_postamble() {
+    using namespace Xbyak::util;
+
+    // Vector registers were stored at rsp + i * vector length by the preamble.
+    for (size_t i = 0; i < preserved_vec_idxs.size(); ++i)
+        pop_vec(preserved_vec_idxs[i], h->ptr[h->rsp + i * get_vec_length()]);
+
+    if (preserved_vec_idxs.size())
+        h->add(h->rsp, preserved_vec_idxs.size() * get_vec_length());
+
+    // Pop exactly the gprs the preamble pushed, in reverse push order. That number is
+    // preserved_gpr_idxs.size(); it is smaller than aux_gprs_count() whenever the caller supplied
+    // gprs through pool_gpr_idxs, so counting by aux_gprs_count() would index past the end of
+    // preserved_gpr_idxs and pop values that were never pushed.
+    for (size_t i = preserved_gpr_idxs.size(); i > 0; --i)
+        h->pop(Reg64(preserved_gpr_idxs[i - 1]));
+
+    preserved_vec_idxs.clear();
+    preserved_gpr_idxs.clear();
+
+    aux_vec_idxs.clear();
+    aux_gpr_idxs.clear();
+}
+
+// Emits the constant table at label l_table (64-byte aligned): each registered value is written
+// once, or repeated to fill a whole vector when the entry is marked as broadcast.
+void jit_emitter::emit_table() {
+    h->align(64);
+    h->L(l_table);
+
+    // Values are written with dd, which relies on them being exactly 4 bytes wide.
+    assert(sizeof(table_entry_val_t) == 4);
+
+    // Walk the map in its iteration order and write the stored values.
+    for (const auto &entry : entry_map_) {
+        const auto &te = entry.second;
+        size_t bytes = sizeof(table_entry_val_t);
+        if (te.bcast)
+            bytes = get_vec_length();
+
+        size_t written = 0;
+        while (written < bytes) {
+            h->dd(te.val);
+            written += sizeof(table_entry_val_t);
+        }
+    }
+}
+
+// Registers the table entries and assigns each one its byte offset from the start of the table.
+void jit_emitter::prepare_table() {
+    register_table_entries();
+
+    // Offsets are assigned only after every entry is registered, following the map's iteration
+    // order. emit_table() walks the map the same way, so no entries should be registered after
+    // this point.
+    size_t next_off = 0;
+    for (auto &entry : entry_map_) {
+        mapped_table_entry_t &te = entry.second;
+        te.off = next_off;
+        if (te.bcast)
+            next_off += get_vec_length();
+        else
+            next_off += sizeof(table_entry_val_t);
+    }
+}
+
+// Emits the complete code for one operation: preamble, emitter-specific body, postamble.
+void jit_emitter::emit(const std::vector<size_t> &in_vec_idxs,
+                       const std::vector<size_t> &out_vec_idxs,
+                       const std::vector<size_t> &pool_vec_idxs,
+                       const std::vector<size_t> &pool_gpr_idxs) {
+    // Reserve and save scratch registers.
+    emitter_preamble(in_vec_idxs, pool_vec_idxs, pool_gpr_idxs);
+    // Emitter-specific instructions.
+    emit_impl(in_vec_idxs, out_vec_idxs, pool_vec_idxs, pool_gpr_idxs);
+    // Restore the saved registers and reset the bookkeeping.
+    emitter_postamble();
+}
+
+} // namespace MKLDNNPlugin
diff --git a/inference-engine/src/mkldnn_plugin/nodes/common/emitter.h b/inference-engine/src/mkldnn_plugin/nodes/common/emitter.h
new file mode 100644 (file)
index 0000000..53a1aef
--- /dev/null
@@ -0,0 +1,128 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ie_common.h>
+#include "jit_generator.hpp"
+#include "mkldnn_node.h"
+#include <set>
+
+namespace MKLDNNPlugin {
+
+// Base class for JIT code emitters. It picks scratch registers, saves and restores them around the
+// emitted body (emitter_preamble / emitter_postamble) and manages an optional table of 32-bit
+// constants addressed through p_table.
+class jit_emitter {
+public:
+    // Initializers are listed in member declaration order, which is the order they actually run in.
+    jit_emitter(mkldnn::impl::cpu::jit_generator* host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32)
+        : n(node), h(host), host_isa_(host_isa), exec_prc_(exec_prc) {
+        k_mask = Xbyak::Opmask(1); // FIXME: in general case we need preserve k_mask state as well
+    }
+
+    // Polymorphic base class: keep destruction through a jit_emitter pointer well-defined.
+    virtual ~jit_emitter() = default;
+
+    // Emits preamble, emit_impl() and postamble for the given input/output vector register indices.
+    // pool_vec_idxs / pool_gpr_idxs are registers the caller offers for use as auxiliaries.
+    virtual void emit(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                      const std::vector<size_t> &pool_vec_idxs = {}, const std::vector<size_t> &pool_gpr_idxs = {});
+    // Emits the constant table at l_table.
+    virtual void emit_table();
+    // Number of input vectors the emitter consumes.
+    virtual size_t get_inputs_num() = 0;
+    // Number of auxiliary vector registers the emitter needs.
+    virtual size_t aux_vecs_count() const;
+    static std::set<InferenceEngine::Precision> get_supported_precisions();
+
+protected:
+    // Number of auxiliary general purpose registers the emitter needs.
+    virtual size_t aux_gprs_count() const;
+
+    size_t get_max_vecs_count() const;
+    size_t get_vec_length() const;
+
+    const MKLDNNNode& n;
+    mkldnn::impl::cpu::jit_generator* h;
+    mkldnn::impl::cpu::cpu_isa_t host_isa_;
+    InferenceEngine::Precision exec_prc_;
+
+    Xbyak::Opmask k_mask;
+
+    // Calls register_table_entries() and then assigns the byte offset of every entry.
+    virtual void prepare_table();
+    // Hook for emitters that need constants; the default registers nothing.
+    virtual void register_table_entries() {}
+
+    void load_table_addr() { h->mov(p_table, l_table); }
+
+    // we accept only 32bit hexadecimal table values to avoid any rounding
+    using table_entry_val_t = uint32_t;
+    using table_entry_offset_t = size_t; // offsets are in bytes wrt p_table
+    using table_entry_bcast_t = bool; // true => bcast value
+
+    struct table_entry_t {
+        table_entry_val_t val;
+        table_entry_bcast_t bcast;
+    };
+    struct mapped_table_entry_t {
+        table_entry_offset_t off;
+        table_entry_val_t val;
+        table_entry_bcast_t bcast;
+    };
+
+    Xbyak::Reg64 p_table;
+    Xbyak::Label l_table;
+
+    // Comparison predicates taken from jit_generator; the "ge"/"gt" names map to its nlt/nle values.
+    enum {
+        _cmp_eq_oq = mkldnn::impl::cpu::jit_generator::_cmp_eq_oq,
+        _cmp_neq_uq = mkldnn::impl::cpu::jit_generator::_cmp_neq_uq,
+        _cmp_lt_os = mkldnn::impl::cpu::jit_generator::_cmp_lt_os,
+        _cmp_le_os = mkldnn::impl::cpu::jit_generator::_cmp_le_os,
+        _cmp_ge_os = mkldnn::impl::cpu::jit_generator::_cmp_nlt_us,
+        _cmp_gt_os = mkldnn::impl::cpu::jit_generator::_cmp_nle_us,
+    };
+
+    // Emitter-specific instructions; called by emit() between preamble and postamble.
+    virtual void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                           const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {}
+
+    virtual void emitter_preamble(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &pool_vec_idxs,
+                          const std::vector<size_t> &pool_gpr_idxs);
+    virtual void emitter_postamble();
+
+    // Auxiliary registers available to emit_impl(); filled by the preamble, cleared by the postamble.
+    std::vector<size_t> aux_vec_idxs;
+    std::vector<size_t> aux_gpr_idxs;
+
+    static constexpr int k_mask_size = 8;
+
+    // Address of the table entry registered under key; key_off_val_shift selects the n-th
+    // element (scaled by the entry size) after it.
+    Xbyak::Address table_val(const std::string &key, size_t key_off_val_shift = 0) const {
+        auto off = table_off(key, key_off_val_shift);
+        return h->ptr[p_table + off];
+    }
+
+    using table_t = std::multimap<std::string, table_entry_t>;
+    using mapped_table_t = std::multimap<std::string, mapped_table_entry_t>;
+
+    mapped_table_t entry_map_;
+
+    // Registers one entry; its offset stays 0 until prepare_table() assigns it.
+    void push_arg_entry_of(const std::string key, const table_entry_val_t val, const bool broadcast) {
+        mapped_table_entry_t te {0, val, broadcast};
+        entry_map_.insert(std::make_pair(key, te));
+    }
+
+    // Registers every entry of t.
+    void push_entries_of(const table_t &t) {
+        for (auto it = t.begin(); it != t.end(); it++) {
+            auto key = (*it).first;
+            auto te = (*it).second; // copy values from table
+            push_arg_entry_of(key, te.val, te.bcast);
+        }
+    }
+
+private:
+    // Registers picked by the preamble itself, which it saves and the postamble restores.
+    std::vector<size_t> preserved_vec_idxs;
+    std::vector<size_t> preserved_gpr_idxs;
+
+    void push_vec(const Xbyak::Address &addr, size_t vec_idx) const;
+    void pop_vec(size_t vec_idx, const Xbyak::Address &addr) const;
+
+    // Byte offset (relative to p_table) of the entry registered under key. The key is only read,
+    // so it is taken by const reference.
+    size_t table_off(const std::string& key, size_t key_off_val_shift = 0) const {
+        // assumption: all table entries sharing the same key also
+        // share their broadcast property
+        // TODO: enforce through data structure
+        const auto it = entry_map_.find(key); // search an entry for a key
+        assert(it != entry_map_.end());
+        const auto &te = (*it).second;
+        const auto scale = te.bcast ? get_vec_length() : sizeof(table_entry_val_t);
+        return te.off + key_off_val_shift * scale;
+    }
+};
+
+} // namespace MKLDNNPlugin
diff --git a/inference-engine/src/mkldnn_plugin/nodes/jit_eltwise_emitters.cpp b/inference-engine/src/mkldnn_plugin/nodes/jit_eltwise_emitters.cpp
new file mode 100644 (file)
index 0000000..aa5449b
--- /dev/null
@@ -0,0 +1,1417 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "common/emitter.h"
+#include "jit_eltwise_emitters.hpp"
+#include "mkldnn_eltwise_node.h"
+#include "jit_uni_eltwise.hpp"
+#include "legacy/ie_layers.h"
+
+using namespace InferenceEngine;
+using namespace mkldnn::impl::utils;
+using namespace mkldnn::impl::cpu;
+using namespace Xbyak;
+
+namespace MKLDNNPlugin {
+
+/// ADD ///
+// Forwards every argument to the jit_emitter base; the add emitter keeps no state of its own here.
+jit_add_emitter::jit_add_emitter(jit_generator *host,
+                                 cpu_isa_t host_isa,
+                                 const MKLDNNNode& node,
+                                 Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+}
+
+// Addition consumes two input vectors.
+size_t jit_add_emitter::get_inputs_num() {
+    return 2;
+}
+
+// Dispatches to the emit_isa<> instantiation matching the host ISA. Any other ISA only trips a
+// debug assert and emits nothing.
+void jit_add_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    switch (host_isa_) {
+    case cpu::sse42:
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx2:
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx512_common:
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+        break;
+    default:
+        assert(!"unsupported isa");
+    }
+}
+
+// Emits dst = src0 + src1 for the register width selected by isa.
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_add_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    const Vmm lhs(in_vec_idxs[0]);
+    const Vmm rhs(in_vec_idxs[1]);
+    const Vmm result(out_vec_idxs[0]);
+
+    if (isa != cpu::sse42) {
+        h->uni_vaddps(result, lhs, rhs);
+        return;
+    }
+
+    // sse42 path: copy the first operand into the destination, then add in place.
+    // NOTE(review): the copy overwrites src1 if the output register aliases it - confirm callers
+    // never pass such a combination.
+    h->uni_vmovups(result, lhs);
+    h->uni_vaddps(result, result, rhs);
+}
+
+/// MUL_ADD ///
+// Forwards every argument to the jit_emitter base.
+jit_mul_add_emitter::jit_mul_add_emitter(jit_generator *host,
+                                         cpu_isa_t host_isa,
+                                         const MKLDNNNode& node,
+                                         Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+}
+
+// Multiply-add consumes three input vectors.
+size_t jit_mul_add_emitter::get_inputs_num() {
+    return 3;
+}
+
+// Dispatches to the emit_isa<> instantiation matching the host ISA. Any other ISA only trips a
+// debug assert and emits nothing.
+void jit_mul_add_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                    const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    switch (host_isa_) {
+    case cpu::sse42:
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx2:
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx512_common:
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+        break;
+    default:
+        assert(!"unsupported isa");
+    }
+}
+
+// Emits dst = src0 * src1 + src2 for the register width selected by isa. The output register may
+// alias any of the three inputs; aux_vec_idxs[0] is used as scratch where an alias would
+// otherwise be overwritten too early.
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_mul_add_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
+    Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
+    Vmm vmm_src2 = Vmm(in_vec_idxs[2]);
+    Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
+    Vmm vmm_dst = Vmm(out_vec_idxs[0]);
+
+    if (isa == cpu::sse42) {
+        // The two-operand sequence first copies src0 into its accumulator. Accumulating in dst
+        // would therefore destroy src2 when dst aliases it, and src1 when dst aliases src1 but
+        // not src0. In those cases accumulate in the scratch register and copy the result out.
+        const bool dst_clobbers_src = vmm_dst.getIdx() == vmm_src2.getIdx() ||
+                (vmm_dst.getIdx() == vmm_src1.getIdx() && vmm_dst.getIdx() != vmm_src0.getIdx());
+        Vmm vmm_acc = dst_clobbers_src ? vmm_aux0 : vmm_dst;
+
+        h->uni_vmovups(vmm_acc, vmm_src0);
+        h->mulps(vmm_acc, vmm_src1);
+        h->addps(vmm_acc, vmm_src2);
+        if (dst_clobbers_src)
+            h->uni_vmovups(vmm_dst, vmm_acc);
+    } else {
+        // dst is loaded with src2 before the fused multiply-add, so a multiplicand that lives in
+        // dst has to be moved to the scratch register first.
+        Vmm vmm_mul0;
+        if (vmm_dst.getIdx() == vmm_src0.getIdx()) {
+            h->uni_vmovups(vmm_aux0, vmm_src0);
+            vmm_mul0 = vmm_aux0;
+        } else {
+            vmm_mul0 = vmm_src0;
+        }
+
+        Vmm vmm_mul1;
+        if (vmm_dst.getIdx() == vmm_src1.getIdx()) {
+            h->uni_vmovups(vmm_aux0, vmm_src1);
+            vmm_mul1 = vmm_aux0;
+        } else {
+            vmm_mul1 = vmm_src1;
+        }
+
+        if (vmm_dst.getIdx() != vmm_src2.getIdx())
+            h->uni_vmovups(vmm_dst, vmm_src2);
+        h->uni_vfmadd231ps(vmm_dst, vmm_mul0, vmm_mul1);
+    }
+}
+
+// One scratch vector register is needed (used by emit_isa as vmm_aux0).
+size_t jit_mul_add_emitter::aux_vecs_count() const {
+    const size_t scratch_vecs = 1;
+    return scratch_vecs;
+}
+
+/// SUB ///
+// Forwards every argument to the jit_emitter base.
+jit_subtract_emitter::jit_subtract_emitter(jit_generator *host,
+                                           cpu_isa_t host_isa,
+                                           const MKLDNNNode& node,
+                                           Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+}
+
+// Subtraction consumes two input vectors.
+size_t jit_subtract_emitter::get_inputs_num() {
+    return 2;
+}
+
+// Dispatches to the emit_isa<> instantiation matching the host ISA. Any other ISA only trips a
+// debug assert and emits nothing.
+void jit_subtract_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    switch (host_isa_) {
+    case cpu::sse42:
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx2:
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx512_common:
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+        break;
+    default:
+        assert(!"unsupported isa");
+    }
+}
+
+// Emits dst = src0 - src1 for the register width selected by isa.
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_subtract_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    const Vmm minuend(in_vec_idxs[0]);
+    const Vmm subtrahend(in_vec_idxs[1]);
+    const Vmm result(out_vec_idxs[0]);
+
+    if (isa != cpu::sse42) {
+        h->uni_vsubps(result, minuend, subtrahend);
+        return;
+    }
+
+    // sse42 path: copy the first operand into the destination, then subtract in place.
+    // NOTE(review): the copy overwrites src1 if the output register aliases it - confirm callers
+    // never pass such a combination.
+    h->uni_vmovups(result, minuend);
+    h->uni_vsubps(result, result, subtrahend);
+}
+
+
+/// MULTIPLY ///
+// Forwards every argument to the jit_emitter base.
+jit_multiply_emitter::jit_multiply_emitter(jit_generator *host,
+                                           cpu_isa_t host_isa,
+                                           const MKLDNNNode& node,
+                                           Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+}
+
+// Multiplication consumes two input vectors.
+size_t jit_multiply_emitter::get_inputs_num() {
+    return 2;
+}
+
+// Dispatches to the emit_isa<> instantiation matching the host ISA. Any other ISA only trips a
+// debug assert and emits nothing.
+void jit_multiply_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    switch (host_isa_) {
+    case cpu::sse42:
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx2:
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx512_common:
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+        break;
+    default:
+        assert(!"unsupported isa");
+    }
+}
+
+// Emits dst = src0 * src1 for the register width selected by isa.
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_multiply_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    const Vmm lhs(in_vec_idxs[0]);
+    const Vmm rhs(in_vec_idxs[1]);
+    const Vmm result(out_vec_idxs[0]);
+
+    if (isa != cpu::sse42) {
+        h->uni_vmulps(result, lhs, rhs);
+        return;
+    }
+
+    // sse42 path: copy the first operand into the destination, then multiply in place.
+    // NOTE(review): the copy overwrites src1 if the output register aliases it - confirm callers
+    // never pass such a combination.
+    h->uni_vmovups(result, lhs);
+    h->uni_vmulps(result, result, rhs);
+}
+
+
+/// DIVIDE ///
+// Forwards every argument to the jit_emitter base.
+jit_divide_emitter::jit_divide_emitter(jit_generator *host,
+                                       cpu_isa_t host_isa,
+                                       const MKLDNNNode& node,
+                                       Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+}
+
+// Division consumes two input vectors.
+size_t jit_divide_emitter::get_inputs_num() {
+    return 2;
+}
+
+// Dispatches to the emit_isa<> instantiation matching the host ISA. Any other ISA only trips a
+// debug assert and emits nothing.
+void jit_divide_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    switch (host_isa_) {
+    case cpu::sse42:
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx2:
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx512_common:
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+        break;
+    default:
+        assert(!"unsupported isa");
+    }
+}
+
+// Emits dst = src0 / src1 for the register width selected by isa.
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_divide_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    const Vmm dividend(in_vec_idxs[0]);
+    const Vmm divisor(in_vec_idxs[1]);
+    const Vmm result(out_vec_idxs[0]);
+
+    if (isa != cpu::sse42) {
+        h->uni_vdivps(result, dividend, divisor);
+        return;
+    }
+
+    // sse42 path: copy the first operand into the destination, then divide in place.
+    // NOTE(review): the copy overwrites src1 if the output register aliases it - confirm callers
+    // never pass such a combination.
+    h->uni_vmovups(result, dividend);
+    h->uni_vdivps(result, result, divisor);
+}
+
+
+/// FLOOR_MOD ///
+// Forwards every argument to the jit_emitter base.
+jit_floor_mod_emitter::jit_floor_mod_emitter(jit_generator *host,
+                                             cpu_isa_t host_isa,
+                                             const MKLDNNNode& node,
+                                             Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+}
+
+// Floor-mod consumes two input vectors.
+size_t jit_floor_mod_emitter::get_inputs_num() {
+    return 2;
+}
+
+// Dispatches to the emit_isa<> instantiation matching the host ISA. Any other ISA only trips a
+// debug assert and emits nothing.
+void jit_floor_mod_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    switch (host_isa_) {
+    case cpu::sse42:
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx2:
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx512_common:
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+        break;
+    default:
+        assert(!"unsupported isa");
+    }
+}
+
+// Emits dst = src0 - round_down(src0 / src1) * src1 for the register width selected by isa.
+// aux_vec_idxs[0] holds the intermediate quotient/product. The output register may alias either
+// input.
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_floor_mod_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
+    Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
+    Vmm vmm_dst = Vmm(out_vec_idxs[0]);
+    Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
+
+    // Compute round_down(src0 / src1) * src1 in the scratch register BEFORE dst is written: dst
+    // may alias src1, and writing src0 into it first would corrupt both the division and the
+    // multiplication.
+    if (isa == cpu::sse42) {
+        // sse42 path: the dividend is copied into the scratch register and divided in place.
+        h->uni_vmovups(vmm_aux0, vmm_src0);
+        h->uni_vdivps(vmm_aux0, vmm_aux0, vmm_src1);
+    } else {
+        h->uni_vdivps(vmm_aux0, vmm_src0, vmm_src1);
+    }
+    h->uni_vroundps(vmm_aux0, vmm_aux0, 1); // rounding down
+    h->uni_vmulps(vmm_aux0, vmm_aux0, vmm_src1);
+
+    // src1 is no longer needed, so dst can now safely receive src0.
+    if (vmm_dst.getIdx() != vmm_src0.getIdx())
+        h->uni_vmovups(vmm_dst, vmm_src0);
+    h->uni_vsubps(vmm_dst, vmm_dst, vmm_aux0);
+}
+
+// One scratch vector register is requested; emit_isa keeps the rounded quotient in aux_vec_idxs[0].
+size_t jit_floor_mod_emitter::aux_vecs_count() const { return 1; }
+
+/// MOD ///
+// Builds the Mod emitter; every argument is passed straight through to the jit_emitter base.
+jit_mod_emitter::jit_mod_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+}
+
+// Mod is binary: emit_isa reads in_vec_idxs[0] and in_vec_idxs[1].
+size_t jit_mod_emitter::get_inputs_num() {
+    return 2;
+}
+
+void jit_mod_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    if (host_isa_ == cpu::sse42) {
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx2) {
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx512_common) {
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+    } else {
+        assert(!"unsupported isa");
+    }
+}
+
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_mod_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
+    Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
+    Vmm vmm_dst = Vmm(out_vec_idxs[0]);
+    Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
+
+    if (isa == cpu::sse42) {
+        if (vmm_dst.getIdx() != vmm_src0.getIdx())
+            h->uni_vmovups(vmm_dst, vmm_src0);
+        h->uni_vmovups(vmm_aux0, vmm_src0);
+        h->uni_vdivps(vmm_aux0, vmm_aux0, vmm_src1);
+        h->uni_vroundps(vmm_aux0, vmm_aux0, 3); // truncate
+        h->uni_vmulps(vmm_aux0, vmm_aux0, vmm_src1);
+        h->uni_vsubps(vmm_dst, vmm_dst, vmm_aux0);
+    } else {
+        if (vmm_dst.getIdx() != vmm_src0.getIdx())
+            h->uni_vmovups(vmm_dst, vmm_src0);
+        h->uni_vdivps(vmm_aux0, vmm_src0, vmm_src1);
+        h->uni_vroundps(vmm_aux0, vmm_aux0, 3); // truncate
+        h->uni_vmulps(vmm_aux0, vmm_aux0, vmm_src1);
+        h->uni_vsubps(vmm_dst, vmm_dst, vmm_aux0);
+    }
+}
+
+// One scratch vector register is requested; emit_isa keeps the truncated quotient in aux_vec_idxs[0].
+size_t jit_mod_emitter::aux_vecs_count() const { return 1; }
+
+/// MAXIMUM ///
+// Builds the Maximum emitter; every argument is passed straight through to the jit_emitter base.
+jit_maximum_emitter::jit_maximum_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+}
+
+// Maximum is binary: emit_isa reads in_vec_idxs[0] and in_vec_idxs[1].
+size_t jit_maximum_emitter::get_inputs_num() {
+    return 2;
+}
+
+void jit_maximum_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    if (host_isa_ == cpu::sse42) {
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx2) {
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx512_common) {
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+    } else {
+        assert(!"unsupported isa");
+    }
+}
+
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_maximum_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
+    Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
+    Vmm vmm_dst = Vmm(out_vec_idxs[0]);
+
+    auto uni_vmax = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) {
+        switch (exec_prc_) {
+            case Precision::FP32: h->uni_vmaxps(vmm_dst, vmm_src0, vmm_src1); break;
+            case Precision::I32:  h->uni_vpmaxsd(vmm_dst, vmm_src0, vmm_src1); break;
+            default: assert(!"unsupported precision");
+        }
+    };
+
+    if (isa == cpu::sse42) {
+        if (vmm_src0.getIdx() != vmm_dst.getIdx())
+            h->uni_vmovups(vmm_dst, vmm_src0);
+        uni_vmax(vmm_dst, vmm_dst, vmm_src1);
+    } else {
+        uni_vmax(vmm_dst, vmm_src0, vmm_src1);
+    }
+}
+
+// Execution precisions handled by emit_isa: fp32 and signed 32-bit integer.
+std::set<InferenceEngine::Precision> jit_maximum_emitter::get_supported_precisions() {
+    std::set<InferenceEngine::Precision> supported{Precision::FP32, Precision::I32};
+    return supported;
+}
+
+/// MINIMUM ///
+// Builds the Minimum emitter; every argument is passed straight through to the jit_emitter base.
+jit_minimum_emitter::jit_minimum_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+}
+
+// Minimum is binary: emit_isa reads in_vec_idxs[0] and in_vec_idxs[1].
+size_t jit_minimum_emitter::get_inputs_num() {
+    return 2;
+}
+
+void jit_minimum_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    if (host_isa_ == cpu::sse42) {
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx2) {
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx512_common) {
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+    } else {
+        assert(!"unsupported isa");
+    }
+}
+
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_minimum_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
+    Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
+    Vmm vmm_dst = Vmm(out_vec_idxs[0]);
+
+    auto uni_vmin = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) {
+        switch (exec_prc_) {
+            case Precision::FP32: h->uni_vminps(vmm_dst, vmm_src0, vmm_src1); break;
+            case Precision::I32:  h->uni_vpminsd(vmm_dst, vmm_src0, vmm_src1); break;
+            default: assert(!"unsupported precision");
+        }
+    };
+
+    if (isa == cpu::sse42) {
+        if (vmm_src0.getIdx() != vmm_dst.getIdx())
+            h->uni_vmovups(vmm_dst, vmm_src0);
+        uni_vmin(vmm_dst, vmm_dst, vmm_src1);
+    } else {
+        uni_vmin(vmm_dst, vmm_src0, vmm_src1);
+    }
+}
+
+// Execution precisions handled by emit_isa: fp32 and signed 32-bit integer.
+std::set<InferenceEngine::Precision> jit_minimum_emitter::get_supported_precisions() {
+    std::set<InferenceEngine::Precision> supported{Precision::FP32, Precision::I32};
+    return supported;
+}
+
+/// SQUARED_DIFFERENCE ///
+// Builds the SquaredDifference emitter; every argument is passed straight through to the jit_emitter base.
+jit_squared_difference_emitter::jit_squared_difference_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+}
+
+// SquaredDifference is binary: emit_isa reads in_vec_idxs[0] and in_vec_idxs[1].
+size_t jit_squared_difference_emitter::get_inputs_num() {
+    return 2;
+}
+
+void jit_squared_difference_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    if (host_isa_ == cpu::sse42) {
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx2) {
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx512_common) {
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+    } else {
+        assert(!"unsupported isa");
+    }
+}
+
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_squared_difference_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
+    Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
+    Vmm vmm_dst = Vmm(out_vec_idxs[0]);
+
+    if (isa == cpu::sse42) {
+        if (vmm_src0.getIdx() != vmm_dst.getIdx())
+            h->uni_vmovups(vmm_dst, vmm_src0);
+        h->uni_vsubps(vmm_dst, vmm_dst, vmm_src1);
+        h->uni_vmulps(vmm_dst, vmm_dst, vmm_dst);
+    } else {
+        h->uni_vsubps(vmm_dst, vmm_src0, vmm_src1);
+        h->uni_vmulps(vmm_dst, vmm_dst, vmm_dst);
+    }
+}
+
+
+/// POWER_DYNAMIC ///
+// Builds the PowerDynamic emitter; every argument is passed straight through to the jit_emitter base.
+jit_power_dynamic_emitter::jit_power_dynamic_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+}
+
+// PowerDynamic is binary: emit_isa reads in_vec_idxs[0] (base) and in_vec_idxs[1] (exponent).
+size_t jit_power_dynamic_emitter::get_inputs_num() {
+    return 2;
+}
+
+void jit_power_dynamic_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    if (host_isa_ == cpu::sse42) {
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx2) {
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx512_common) {
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+    } else {
+        assert(!"unsupported isa");
+    }
+}
+
+template <mkldnn::impl::cpu::cpu_isa_t isa> // emits dst[i] = powf(src0[i], src1[i]) by calling powf once per fp32 lane
+void jit_power_dynamic_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm vmm_src0 = Vmm(in_vec_idxs[0]); // base
+    Vmm vmm_src1 = Vmm(in_vec_idxs[1]); // exponent
+    Vmm vmm_dst = Vmm(out_vec_idxs[0]);
+
+    Xmm xmm0 = Xmm(0), xmm1 = Xmm(1); // loaded with one base / exponent element before each call; xmm0 is stored back as the result
+
+    // The generated code calls an external function, so every general-purpose register
+    // listed below is spilled to the stack first and reloaded at the end.
+    size_t gpr_size = 8;
+    Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->rax,
+                                     h->rcx, h->rdx, h->rdi, h->rsi, h->rbp, h->rbx};
+    size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);
+
+    h->sub(h->rsp, n_gprs_to_save * gpr_size);
+    for (size_t i = 0; i < n_gprs_to_save; ++i)
+        h->mov(h->ptr[h->rsp + i * gpr_size], gprs_to_save[i]);
+
+    // Opmask registers k0..k7 are spilled as well (AVX-512 only); kmovq is used when avx512_core is available, kmovw otherwise.
+    size_t n_k_regs_to_save = 8;
+    if (isa == cpu::avx512_common || isa == cpu::avx512_core) {
+        h->sub(h->rsp, n_k_regs_to_save * k_mask_size);
+        for (size_t i = 0; i < n_k_regs_to_save; ++i) {
+            if (mayiuse(avx512_core))
+                h->kmovq(h->ptr[h->rsp + i * k_mask_size], Opmask(i));
+            else
+                h->kmovw(h->ptr[h->rsp + i * k_mask_size], Opmask(i));
+        }
+    }
+
+    // 1. Every vector register Vmm(0) .. Vmm(get_max_vecs_count() - 1) is spilled, because the
+    //    called function may overwrite any of them.
+    // 2. Two extra vector-sized slots are reserved below them: slot 0 receives the base (vmm_src0)
+    //    and is overwritten in place with the results, slot 1 receives the exponent (vmm_src1).
+    //    Resulting layout: [rsp + 0*vlen] = base/result, [rsp + 1*vlen] = exponent,
+    //    [rsp + (i + 2)*vlen] = Vmm(i).
+    // 3. There is an implicit assumption that the host code uses the same `isa` as this emitter.
+    //    Once the assumption is wrong, `vecs_count` and `vlen` should be replaced with
+    //    `host_isa::vlen` and `host_isa::vecs_count`.
+    h->sub(h->rsp, (get_max_vecs_count() + 2) * get_vec_length());
+    for (size_t i = 2; i < get_max_vecs_count() + 2; ++i)
+        h->uni_vmovups(h->ptr[h->rsp + i * get_vec_length()], Vmm(i - 2));
+    h->uni_vmovups(h->ptr[h->rsp + 0 * get_vec_length()], vmm_src0); // src
+    h->uni_vmovups(h->ptr[h->rsp + 1 * get_vec_length()], vmm_src1); // beta
+
+    // keep the address of powf in rbp; it is the target of the indirect call below
+    h->mov(h->rbp, reinterpret_cast<uintptr_t>(powf));
+
+    // Align the stack to 16 bytes: rbx = rsp & 0xf is the misalignment, which is subtracted from rsp
+    // and added back after the loop. NOTE(review): no shadow space is reserved before the call - confirm whether the Windows x64 calling convention must be supported here.
+    h->mov(h->rbx, h->rsp);
+    h->and_(h->rbx, 0xf);
+    h->sub(h->rsp, h->rbx);
+
+    // For every fp32 element: load base and exponent from the spill slots (addressed through rsp + rbx), call powf, store xmm0 over the base element.
+    for (size_t i = 0; i < get_vec_length() / sizeof(float); ++i) {
+        const Address &source = h->ptr[h->rsp + h->rbx + i * sizeof(float)];
+        h->uni_vmovss(xmm0, source);
+        h->uni_vmovss(xmm1, h->ptr[h->rsp + h->rbx + get_vec_length() + i * sizeof(float)]);
+        h->call(h->rbp);
+        h->uni_vmovss(source, xmm0);
+    }
+
+    h->add(h->rsp, h->rbx); // undo the alignment adjustment
+
+    // reload the spilled vector registers, then load the computed results from slot 0 into vmm_dst
+    for (size_t i = get_max_vecs_count() + 1; i >= 2; --i)
+        h->uni_vmovups(Vmm(i - 2), h->ptr[h->rsp + i * get_vec_length()]);
+    h->uni_vmovups(vmm_dst, h->ptr[h->rsp + 0 * get_vec_length()]);
+    h->add(h->rsp, (get_max_vecs_count() + 2) * get_vec_length());
+
+    // reload the opmask registers (AVX-512 only)
+    if (isa == cpu::avx512_common || isa == cpu::avx512_core) {
+        for (int i = n_k_regs_to_save - 1; i >= 0; --i) {
+            if (mayiuse(avx512_core))
+                h->kmovq(Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
+            else
+                h->kmovw(Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
+        }
+        h->add(h->rsp, n_k_regs_to_save * k_mask_size);
+    }
+
+    // reload the general-purpose registers and release their stack area
+    for (int i = n_gprs_to_save - 1; i >= 0; --i)
+        h->mov(gprs_to_save[i], h->ptr[h->rsp + i * gpr_size]);
+    h->add(h->rsp, n_gprs_to_save * gpr_size);
+}
+
+
+/// EQUAL ///
+// Builds the Equal emitter: forwards all arguments to the jit_emitter base, then calls
+// prepare_table() (presumably to build the constant table read via table_val() - confirm in jit_emitter).
+jit_equal_emitter::jit_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc)
+{
+    prepare_table();
+}
+
+// Equal is binary: emit_isa reads in_vec_idxs[0] and in_vec_idxs[1].
+size_t jit_equal_emitter::get_inputs_num() {
+    return 2;
+}
+
+void jit_equal_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    if (host_isa_ == cpu::sse42) {
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx2) {
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx512_common) {
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+    } else {
+        assert(!"unsupported isa");
+    }
+}
+
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_equal_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
+    Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
+    Vmm vmm_dst = Vmm(out_vec_idxs[0]);
+    Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
+    Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
+
+    if (isa == cpu::sse42) {
+        h->movups(vmm_aux0, vmm_src0);
+        h->cmpps(vmm_aux0, vmm_src1, _cmp_eq_oq);
+        h->movups(vmm_aux1, table_val("one"));
+        h->pxor(vmm_dst, vmm_dst);
+        h->blendvps(vmm_dst, vmm_aux1);
+    } else if (isa == cpu::avx2) {
+        h->vcmpeqps(vmm_aux0, vmm_src0, vmm_src1);
+        h->uni_vmovups(vmm_dst, table_val("zero"));
+        h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0);
+    } else {
+        h->vcmpps(k_mask, vmm_src0, vmm_src1, _cmp_eq_oq);
+        h->uni_vmovups(vmm_dst, table_val("zero"));
+        h->vblendmps(vmm_dst | k_mask, vmm_dst, table_val("one"));
+    }
+}
+
+// Registers the constants that emit_isa fetches with table_val("zero") and table_val("one").
+void jit_equal_emitter::register_table_entries() {
+    const int fp32_zero_bits = 0x00000000;
+    const int fp32_one_bits = 0x3f800000;  // bit pattern of 1.0f
+    push_arg_entry_of("zero", fp32_zero_bits, true);
+    push_arg_entry_of("one", fp32_one_bits, true);
+}
+
+// Two scratch vector registers are requested; the SSE4.2 path of emit_isa uses both.
+size_t jit_equal_emitter::aux_vecs_count() const { return 2; }
+
+/// NOT_EQUAL ///
+// Builds the NotEqual emitter: forwards all arguments to the jit_emitter base, then calls
+// prepare_table() (presumably to build the constant table read via table_val() - confirm in jit_emitter).
+jit_not_equal_emitter::jit_not_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc)
+{
+    prepare_table();
+}
+
+// NotEqual is binary: emit_isa reads in_vec_idxs[0] and in_vec_idxs[1].
+size_t jit_not_equal_emitter::get_inputs_num() {
+    return 2;
+}
+
+void jit_not_equal_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    if (host_isa_ == cpu::sse42) {
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx2) {
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx512_common) {
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+    } else {
+        assert(!"unsupported isa");
+    }
+}
+
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_not_equal_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
+    Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
+    Vmm vmm_dst = Vmm(out_vec_idxs[0]);
+    Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
+    Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
+
+    if (isa == cpu::sse42) {
+        h->movups(vmm_aux0, vmm_src0);
+        h->cmpps(vmm_aux0, vmm_src1, _cmp_eq_oq);
+        h->movups(vmm_dst, table_val("one"));
+        h->pxor(vmm_aux1, vmm_aux1);
+        h->blendvps(vmm_dst, vmm_aux1);
+    } else if (isa == cpu::avx2) {
+        h->vcmpeqps(vmm_aux0, vmm_src0, vmm_src1);
+        h->uni_vmovups(vmm_dst, table_val("one"));
+        h->vblendvps(vmm_dst, vmm_dst, table_val("zero"), vmm_aux0);
+    } else {
+        h->vcmpps(k_mask, vmm_src0, vmm_src1, _cmp_eq_oq);
+        h->uni_vmovups(vmm_dst, table_val("one"));
+        h->vblendmps(vmm_dst | k_mask, vmm_dst, table_val("zero"));
+    }
+}
+
+// Registers the constants that emit_isa fetches with table_val("zero") and table_val("one").
+void jit_not_equal_emitter::register_table_entries() {
+    const int fp32_zero_bits = 0x00000000;
+    const int fp32_one_bits = 0x3f800000;  // bit pattern of 1.0f
+    push_arg_entry_of("zero", fp32_zero_bits, true);
+    push_arg_entry_of("one", fp32_one_bits, true);
+}
+
+// Two scratch vector registers are requested; the SSE4.2 path of emit_isa uses both.
+size_t jit_not_equal_emitter::aux_vecs_count() const { return 2; }
+
+/// GREATER ///
+// Builds the Greater emitter: forwards all arguments to the jit_emitter base, then calls
+// prepare_table() (presumably to build the constant table read via table_val() - confirm in jit_emitter).
+jit_greater_emitter::jit_greater_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc)
+{
+    prepare_table();
+}
+
+// Greater is binary: emit_isa reads in_vec_idxs[0] and in_vec_idxs[1].
+size_t jit_greater_emitter::get_inputs_num() {
+    return 2;
+}
+
+void jit_greater_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    if (host_isa_ == cpu::sse42) {
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx2) {
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx512_common) {
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+    } else {
+        assert(!"unsupported isa");
+    }
+}
+
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_greater_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
+    Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
+    Vmm vmm_dst = Vmm(out_vec_idxs[0]);
+    Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
+    Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
+
+    if (isa == cpu::sse42) {
+        h->movups(vmm_aux0, vmm_src0);
+        h->cmpps(vmm_aux0, vmm_src1, _cmp_gt_os);
+        h->movups(vmm_aux1, table_val("one"));
+        h->pxor(vmm_dst, vmm_dst);
+        h->blendvps(vmm_dst, vmm_aux1);
+    } else if (isa == cpu::avx2) {
+        h->vcmpgtps(vmm_aux0, vmm_src0, vmm_src1);
+        h->uni_vmovups(vmm_dst, table_val("zero"));
+        h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0);
+    } else {
+        h->vcmpps(k_mask, vmm_src0, vmm_src1, _cmp_gt_os);
+        h->uni_vmovups(vmm_dst, table_val("zero"));
+        h->vblendmps(vmm_dst | k_mask, vmm_dst, table_val("one"));
+    }
+}
+
+// Registers the constants that emit_isa fetches with table_val("zero") and table_val("one").
+void jit_greater_emitter::register_table_entries() {
+    const int fp32_zero_bits = 0x00000000;
+    const int fp32_one_bits = 0x3f800000;  // bit pattern of 1.0f
+    push_arg_entry_of("zero", fp32_zero_bits, true);
+    push_arg_entry_of("one", fp32_one_bits, true);
+}
+
+// Two scratch vector registers are requested; the SSE4.2 path of emit_isa uses both.
+size_t jit_greater_emitter::aux_vecs_count() const { return 2; }
+
+/// GREATER_EQUAL ///
+// Builds the GreaterEqual emitter: forwards all arguments to the jit_emitter base, then calls
+// prepare_table() (presumably to build the constant table read via table_val() - confirm in jit_emitter).
+jit_greater_equal_emitter::jit_greater_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc)
+{
+    prepare_table();
+}
+
+// GreaterEqual is binary: emit_isa reads in_vec_idxs[0] and in_vec_idxs[1].
+size_t jit_greater_equal_emitter::get_inputs_num() {
+    return 2;
+}
+
+void jit_greater_equal_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    if (host_isa_ == cpu::sse42) {
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx2) {
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx512_common) {
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+    } else {
+        assert(!"unsupported isa");
+    }
+}
+
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_greater_equal_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
+    Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
+    Vmm vmm_dst = Vmm(out_vec_idxs[0]);
+    Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
+    Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
+
+    if (isa == cpu::sse42) {
+        h->movups(vmm_aux0, vmm_src0);
+        h->cmpps(vmm_aux0, vmm_src1, _cmp_ge_os);
+        h->movups(vmm_aux1, table_val("one"));
+        h->pxor(vmm_dst, vmm_dst);
+        h->blendvps(vmm_dst, vmm_aux1);
+    } else if (isa == cpu::avx2) {
+        h->vcmpgeps(vmm_aux0, vmm_src0, vmm_src1);
+        h->uni_vmovups(vmm_dst, table_val("zero"));
+        h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0);
+    } else {
+        h->vcmpps(k_mask, vmm_src0, vmm_src1, _cmp_ge_os);
+        h->uni_vmovups(vmm_dst, table_val("zero"));
+        h->vblendmps(vmm_dst | k_mask, vmm_dst, table_val("one"));
+    }
+}
+
+// Registers the constants that emit_isa fetches with table_val("zero") and table_val("one").
+void jit_greater_equal_emitter::register_table_entries() {
+    const int fp32_zero_bits = 0x00000000;
+    const int fp32_one_bits = 0x3f800000;  // bit pattern of 1.0f
+    push_arg_entry_of("zero", fp32_zero_bits, true);
+    push_arg_entry_of("one", fp32_one_bits, true);
+}
+
+// Two scratch vector registers are requested; the SSE4.2 path of emit_isa uses both.
+size_t jit_greater_equal_emitter::aux_vecs_count() const { return 2; }
+
+/// LESS ///
+// Builds the Less emitter: forwards all arguments to the jit_emitter base, then calls
+// prepare_table() (presumably to build the constant table read via table_val() - confirm in jit_emitter).
+jit_less_emitter::jit_less_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc)
+{
+    prepare_table();
+}
+
+// Less is binary: emit_isa reads in_vec_idxs[0] and in_vec_idxs[1].
+size_t jit_less_emitter::get_inputs_num() {
+    return 2;
+}
+
+void jit_less_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    if (host_isa_ == cpu::sse42) {
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx2) {
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx512_common) {
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+    } else {
+        assert(!"unsupported isa");
+    }
+}
+
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_less_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
+    Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
+    Vmm vmm_dst = Vmm(out_vec_idxs[0]);
+    Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
+    Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
+
+    if (isa == cpu::sse42) {
+        h->movups(vmm_aux0, vmm_src0);
+        h->cmpps(vmm_aux0, vmm_src1, _cmp_lt_os);
+        h->movups(vmm_aux1, table_val("one"));
+        h->pxor(vmm_dst, vmm_dst);
+        h->blendvps(vmm_dst, vmm_aux1);
+    } else if (isa == cpu::avx2) {
+        h->vcmpltps(vmm_aux0, vmm_src0, vmm_src1);
+        h->uni_vmovups(vmm_dst, table_val("zero"));
+        h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0);
+    } else {
+        h->vcmpps(k_mask, vmm_src0, vmm_src1, _cmp_lt_os);
+        h->uni_vmovups(vmm_dst, table_val("zero"));
+        h->vblendmps(vmm_dst | k_mask, vmm_dst, table_val("one"));
+    }
+}
+
+// Registers the constants that emit_isa fetches with table_val("zero") and table_val("one").
+void jit_less_emitter::register_table_entries() {
+    const int fp32_zero_bits = 0x00000000;
+    const int fp32_one_bits = 0x3f800000;  // bit pattern of 1.0f
+    push_arg_entry_of("zero", fp32_zero_bits, true);
+    push_arg_entry_of("one", fp32_one_bits, true);
+}
+
+// Two scratch vector registers are requested; the SSE4.2 path of emit_isa uses both.
+size_t jit_less_emitter::aux_vecs_count() const { return 2; }
+
+/// LESS_EQUAL ///
+// Builds the LessEqual emitter: forwards all arguments to the jit_emitter base, then calls
+// prepare_table() (presumably to build the constant table read via table_val() - confirm in jit_emitter).
+jit_less_equal_emitter::jit_less_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc)
+{
+    prepare_table();
+}
+
+// LessEqual is binary: emit_isa reads in_vec_idxs[0] and in_vec_idxs[1].
+size_t jit_less_equal_emitter::get_inputs_num() {
+    return 2;
+}
+
+void jit_less_equal_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    if (host_isa_ == cpu::sse42) {
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx2) {
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx512_common) {
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+    } else {
+        assert(!"unsupported isa");
+    }
+}
+
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_less_equal_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
+    Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
+    Vmm vmm_dst = Vmm(out_vec_idxs[0]);
+
+    Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
+    Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
+
+    if (isa == cpu::sse42) {
+        h->movups(vmm_aux0, vmm_src0);
+        h->cmpps(vmm_aux0, vmm_src1, _cmp_le_os);
+        h->movups(vmm_aux1, table_val("one"));
+        h->pxor(vmm_dst, vmm_dst);
+        h->blendvps(vmm_dst, vmm_aux1);
+    } else if (isa == cpu::avx2) {
+        h->vcmpleps(vmm_aux0, vmm_src0, vmm_src1);
+        h->uni_vmovups(vmm_dst, table_val("zero"));
+        h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0);
+    } else {
+        h->vcmpps(k_mask, vmm_src0, vmm_src1, _cmp_le_os);
+        h->uni_vmovups(vmm_dst, table_val("zero"));
+        h->vblendmps(vmm_dst | k_mask, vmm_dst, table_val("one"));
+    }
+}
+
+// Registers the constants that emit_isa fetches with table_val("zero") and table_val("one").
+void jit_less_equal_emitter::register_table_entries() {
+    const int fp32_zero_bits = 0x00000000;
+    const int fp32_one_bits = 0x3f800000;  // bit pattern of 1.0f
+    push_arg_entry_of("zero", fp32_zero_bits, true);
+    push_arg_entry_of("one", fp32_one_bits, true);
+}
+
+// Two scratch vector registers are requested; the SSE4.2 path of emit_isa uses both.
+size_t jit_less_equal_emitter::aux_vecs_count() const { return 2; }
+
+/// LOGICAL_AND ///
+// Builds the LogicalAnd emitter: forwards all arguments to the jit_emitter base, then calls
+// prepare_table() (presumably to build the constant table read via table_val() - confirm in jit_emitter).
+jit_logical_and_emitter::jit_logical_and_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc)
+{
+    prepare_table();
+}
+
+// LogicalAnd is binary: emit_isa reads in_vec_idxs[0] and in_vec_idxs[1].
+size_t jit_logical_and_emitter::get_inputs_num() {
+    return 2;
+}
+
+void jit_logical_and_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    if (host_isa_ == cpu::sse42) {
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx2) {
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx512_common) {
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+    } else {
+        assert(!"unsupported isa");
+    }
+}
+
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_logical_and_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
+    Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
+    Vmm vmm_dst = Vmm(out_vec_idxs[0]);
+    Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
+    Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
+    Vmm vmm_aux2 = Vmm(aux_vec_idxs[2]);
+
+    if (isa == cpu::sse42) {
+        h->pxor(vmm_aux0, vmm_aux0);
+        h->cmpps(vmm_aux0, vmm_src0, _cmp_eq_oq);
+        h->movups(vmm_dst, table_val("one"));
+        h->pxor(vmm_aux1, vmm_aux1);
+        h->blendvps(vmm_dst, vmm_aux1);
+
+        h->pxor(vmm_aux0, vmm_aux0);
+        h->cmpps(vmm_aux0, vmm_src1, _cmp_eq_oq);
+        h->movups(vmm_aux2, table_val("one"));
+        h->pxor(vmm_aux1, vmm_aux1);
+        h->blendvps(vmm_aux2, vmm_aux1);
+
+        h->uni_vandps(vmm_dst, vmm_dst, vmm_aux2);
+    } else if (isa == cpu::avx2) {
+        h->vcmpeqps(vmm_aux0, vmm_src0, table_val("zero"));
+        h->uni_vmovups(vmm_dst, table_val("one"));
+        h->vblendvps(vmm_dst, vmm_dst, table_val("zero"), vmm_aux0);
+
+        h->vcmpeqps(vmm_aux1, vmm_src1, table_val("zero"));
+        h->uni_vmovups(vmm_aux0, table_val("one"));
+        h->vblendvps(vmm_aux0, vmm_aux0, table_val("zero"), vmm_aux1);
+
+        h->uni_vandps(vmm_dst, vmm_dst, vmm_aux0);
+    } else {
+        h->vcmpps(k_mask, vmm_src0, table_val("zero"), _cmp_eq_oq);
+        h->uni_vmovups(vmm_aux0, table_val("one"));
+        h->vblendmps(vmm_dst | k_mask, vmm_aux0, table_val("zero"));
+
+        h->vcmpps(k_mask, vmm_src1, table_val("zero"), _cmp_eq_oq);
+        h->vblendmps(vmm_aux0 | k_mask, vmm_aux0, table_val("zero"));
+
+        h->uni_vandps(vmm_dst, vmm_dst, vmm_aux0);
+    }
+}
+
+// Registers the constants that emit_isa fetches with table_val("zero") and table_val("one").
+void jit_logical_and_emitter::register_table_entries() {
+    const int fp32_zero_bits = 0x00000000;
+    const int fp32_one_bits = 0x3f800000;  // bit pattern of 1.0f
+    push_arg_entry_of("zero", fp32_zero_bits, true);
+    push_arg_entry_of("one", fp32_one_bits, true);
+}
+
+// Three scratch vector registers are requested; the SSE4.2 path of emit_isa uses all of them.
+size_t jit_logical_and_emitter::aux_vecs_count() const { return 3; }
+
+
+/// LOGICAL_OR ///
+// Builds the LogicalOr emitter: forwards all arguments to the jit_emitter base, then calls
+// prepare_table() (presumably to build the constant table read via table_val() - confirm in jit_emitter).
+jit_logical_or_emitter::jit_logical_or_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc)
+{
+    prepare_table();
+}
+
+// LogicalOr is binary: emit_isa reads in_vec_idxs[0] and in_vec_idxs[1].
+size_t jit_logical_or_emitter::get_inputs_num() {
+    return 2;
+}
+
+void jit_logical_or_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    if (host_isa_ == cpu::sse42) {
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx2) {
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx512_common) {
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+    } else {
+        assert(!"unsupported isa");
+    }
+}
+
+template <mkldnn::impl::cpu::cpu_isa_t isa>  // emits per-lane logical OR: each input becomes 0.0f (== 0) or 1.0f (otherwise), then the two are OR-ed bitwise
+void jit_logical_or_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;  // Xmm on sse4.2, Ymm on avx2, Zmm otherwise
+    Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
+    Vmm vmm_src1 = Vmm(in_vec_idxs[1]);  // NOTE(review): src1 is read only after dst has been written in every branch - assumes dst never aliases src1, confirm with callers
+    Vmm vmm_dst = Vmm(out_vec_idxs[0]);
+    Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);  // NOTE(review): SSE4.1 blendvps takes its mask implicitly from xmm0, so this presumably has to be register 0 on sse4.2 - confirm in jit_emitter
+    Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
+    Vmm vmm_aux2 = Vmm(aux_vec_idxs[2]);
+
+    if (isa == cpu::sse42) {
+        h->pxor(vmm_aux0, vmm_aux0);
+        h->cmpps(vmm_aux0, vmm_src0, _cmp_eq_oq);  // aux0 = lane mask of (src0 == 0)
+        h->movups(vmm_dst, table_val("one"));
+        h->pxor(vmm_aux1, vmm_aux1);
+        h->blendvps(vmm_dst, vmm_aux1);  // dst = (src0 == 0) ? 0.0f : 1.0f
+
+        h->pxor(vmm_aux0, vmm_aux0);
+        h->cmpps(vmm_aux0, vmm_src1, _cmp_eq_oq);  // aux0 = lane mask of (src1 == 0)
+        h->movups(vmm_aux2, table_val("one"));
+        h->pxor(vmm_aux1, vmm_aux1);
+        h->blendvps(vmm_aux2, vmm_aux1);  // aux2 = (src1 == 0) ? 0.0f : 1.0f
+
+        h->uni_vorps(vmm_dst, vmm_dst, vmm_aux2);  // bitwise OR of the two 0.0f/1.0f patterns yields 1.0f if either is 1.0f
+    } else if (isa == cpu::avx2) {
+        h->vcmpeqps(vmm_aux0, vmm_src0, table_val("zero"));  // aux0 = lane mask of (src0 == 0)
+        h->uni_vmovups(vmm_dst, table_val("one"));
+        h->vblendvps(vmm_dst, vmm_dst, table_val("zero"), vmm_aux0);  // dst = (src0 == 0) ? 0.0f : 1.0f
+
+        h->vcmpeqps(vmm_aux1, vmm_src1, table_val("zero"));  // aux1 = lane mask of (src1 == 0)
+        h->uni_vmovups(vmm_aux0, table_val("one"));
+        h->vblendvps(vmm_aux0, vmm_aux0, table_val("zero"), vmm_aux1);  // aux0 = (src1 == 0) ? 0.0f : 1.0f
+
+        h->uni_vorps(vmm_dst, vmm_dst, vmm_aux0);
+    } else {
+        h->vcmpps(k_mask, vmm_src0, table_val("zero"), _cmp_eq_oq);  // k_mask = lanes where src0 == 0
+        h->uni_vmovups(vmm_aux0, table_val("one"));
+        h->vblendmps(vmm_dst | k_mask, vmm_aux0, table_val("zero"));  // dst = k_mask ? 0.0f : 1.0f
+
+        h->vcmpps(k_mask, vmm_src1, table_val("zero"), _cmp_eq_oq);  // k_mask = lanes where src1 == 0
+        h->vblendmps(vmm_aux0 | k_mask, vmm_aux0, table_val("zero"));  // aux0 (still all 1.0f) = k_mask ? 0.0f : 1.0f
+
+        h->uni_vorps(vmm_dst, vmm_dst, vmm_aux0);
+    }
+}
+
+void jit_logical_or_emitter::register_table_entries() {  // constants that emit_isa fetches through table_val()
+    push_arg_entry_of("zero", 0x00000000, true);  // bit pattern of 0.0f; NOTE(review): the `true` flag presumably requests a broadcast entry - confirm in jit_emitter
+    push_arg_entry_of("one", 0x3f800000, true);  // bit pattern of 1.0f
+}
+
+size_t jit_logical_or_emitter::aux_vecs_count() const {  // scratch vector registers requested by this emitter
+    return 3;  // emit_isa binds aux_vec_idxs[0..2]; the sse4.2 path uses all three
+}
+
+/// LOGICAL_XOR ///
+jit_logical_xor_emitter::jit_logical_xor_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+: jit_emitter(host, host_isa, node, exec_prc) {  // all arguments are forwarded unchanged to the base emitter
+    prepare_table();  // NOTE(review): presumably builds the constant table from register_table_entries() - confirm in jit_emitter
+}
+
+size_t jit_logical_xor_emitter::get_inputs_num() { return 2; }  // binary operation: emit_isa reads in_vec_idxs[0] and in_vec_idxs[1]
+
+void jit_logical_xor_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    if (host_isa_ == cpu::sse42) {
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx2) {
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx512_common) {
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+    } else {
+        assert(!"unsupported isa");
+    }
+}
+
+template <mkldnn::impl::cpu::cpu_isa_t isa>  // emits per-lane logical XOR: each input becomes 0.0f (== 0) or 1.0f (otherwise), then the two are XOR-ed bitwise
+void jit_logical_xor_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;  // Xmm on sse4.2, Ymm on avx2, Zmm otherwise
+    Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
+    Vmm vmm_src1 = Vmm(in_vec_idxs[1]);  // NOTE(review): src1 is read only after dst has been written in every branch - assumes dst never aliases src1, confirm with callers
+    Vmm vmm_dst = Vmm(out_vec_idxs[0]);
+    Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);  // NOTE(review): SSE4.1 blendvps takes its mask implicitly from xmm0, so this presumably has to be register 0 on sse4.2 - confirm in jit_emitter
+    Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
+    Vmm vmm_aux2 = Vmm(aux_vec_idxs[2]);
+
+    if (isa == cpu::sse42) {
+        h->pxor(vmm_aux0, vmm_aux0);
+        h->cmpps(vmm_aux0, vmm_src0, _cmp_eq_oq);  // aux0 = lane mask of (src0 == 0)
+        h->movups(vmm_dst, table_val("one"));
+        h->pxor(vmm_aux1, vmm_aux1);
+        h->blendvps(vmm_dst, vmm_aux1);  // dst = (src0 == 0) ? 0.0f : 1.0f
+
+        h->pxor(vmm_aux0, vmm_aux0);
+        h->cmpps(vmm_aux0, vmm_src1, _cmp_eq_oq);  // aux0 = lane mask of (src1 == 0)
+        h->movups(vmm_aux2, table_val("one"));
+        h->pxor(vmm_aux1, vmm_aux1);
+        h->blendvps(vmm_aux2, vmm_aux1);  // aux2 = (src1 == 0) ? 0.0f : 1.0f
+
+        h->uni_vxorps(vmm_dst, vmm_dst, vmm_aux2);  // equal patterns cancel to 0.0f, differing ones leave 1.0f
+    } else if (isa == cpu::avx2) {
+        h->vcmpeqps(vmm_aux0, vmm_src0, table_val("zero"));  // aux0 = lane mask of (src0 == 0)
+        h->uni_vmovups(vmm_dst, table_val("one"));
+        h->vblendvps(vmm_dst, vmm_dst, table_val("zero"), vmm_aux0);  // dst = (src0 == 0) ? 0.0f : 1.0f
+
+        h->vcmpeqps(vmm_aux1, vmm_src1, table_val("zero"));  // aux1 = lane mask of (src1 == 0)
+        h->uni_vmovups(vmm_aux0, table_val("one"));
+        h->vblendvps(vmm_aux0, vmm_aux0, table_val("zero"), vmm_aux1);  // aux0 = (src1 == 0) ? 0.0f : 1.0f
+
+        h->uni_vxorps(vmm_dst, vmm_dst, vmm_aux0);
+    } else {
+        h->vcmpps(k_mask, vmm_src0, table_val("zero"), _cmp_eq_oq);  // k_mask = lanes where src0 == 0
+        h->uni_vmovups(vmm_aux0, table_val("one"));
+        h->vblendmps(vmm_dst | k_mask, vmm_aux0, table_val("zero"));  // dst = k_mask ? 0.0f : 1.0f
+
+        h->vcmpps(k_mask, vmm_src1, table_val("zero"), _cmp_eq_oq);  // k_mask = lanes where src1 == 0
+        h->vblendmps(vmm_aux0 | k_mask, vmm_aux0, table_val("zero"));  // aux0 (still all 1.0f) = k_mask ? 0.0f : 1.0f
+
+        h->uni_vxorps(vmm_dst, vmm_dst, vmm_aux0);
+    }
+}
+
+void jit_logical_xor_emitter::register_table_entries() {  // constants that emit_isa fetches through table_val()
+    push_arg_entry_of("zero", 0x00000000, true);  // bit pattern of 0.0f; NOTE(review): the `true` flag presumably requests a broadcast entry - confirm in jit_emitter
+    push_arg_entry_of("one", 0x3f800000, true);  // bit pattern of 1.0f
+}
+
+size_t jit_logical_xor_emitter::aux_vecs_count() const {  // scratch vector registers requested by this emitter
+    return 3;  // emit_isa binds aux_vec_idxs[0..2]; the sse4.2 path uses all three
+}
+
+/// LOGICAL_NOT ///
+jit_logical_not_emitter::jit_logical_not_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+: jit_emitter(host, host_isa, node, exec_prc) {  // all arguments are forwarded unchanged to the base emitter
+    prepare_table();  // NOTE(review): presumably builds the constant table from register_table_entries() - confirm in jit_emitter
+}
+
+size_t jit_logical_not_emitter::get_inputs_num() { return 1; }  // unary operation: emit_isa reads only in_vec_idxs[0]
+
+void jit_logical_not_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    if (host_isa_ == cpu::sse42) {
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx2) {
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx512_common) {
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+    } else {
+        assert(!"unsupported isa");
+    }
+}
+
+template <mkldnn::impl::cpu::cpu_isa_t isa>  // emits per-lane logical NOT: dst = 1.0f where src0 == 0, otherwise 0.0f
+void jit_logical_not_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;  // Xmm on sse4.2, Ymm on avx2, Zmm otherwise
+    Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
+    Vmm vmm_dst = Vmm(out_vec_idxs[0]);
+    Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);  // NOTE(review): SSE4.1 blendvps takes its mask implicitly from xmm0, so this presumably has to be register 0 on sse4.2 - confirm in jit_emitter
+    Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
+
+    if (isa == cpu::sse42) {
+        h->pxor(vmm_aux0, vmm_aux0);
+        h->cmpps(vmm_aux0, vmm_src0, _cmp_eq_oq);  // aux0 = lane mask of (src0 == 0); src0 is consumed before dst is cleared
+        h->movups(vmm_aux1, table_val("one"));
+        h->pxor(vmm_dst, vmm_dst);
+        h->blendvps(vmm_dst, vmm_aux1);  // dst = (src0 == 0) ? 1.0f : 0.0f
+    } else if (isa == cpu::avx2) {
+        h->vcmpeqps(vmm_aux0, vmm_src0, table_val("zero"));  // aux0 = lane mask of (src0 == 0)
+        h->uni_vmovups(vmm_dst, table_val("zero"));
+        h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0);  // dst = (src0 == 0) ? 1.0f : 0.0f
+    } else {
+        h->vcmpps(k_mask, vmm_src0, table_val("zero"), _cmp_eq_oq);  // k_mask = lanes where src0 == 0
+        h->uni_vmovups(vmm_dst, table_val("zero"));
+        h->vblendmps(vmm_dst | k_mask, vmm_dst, table_val("one"));  // dst = k_mask ? 1.0f : 0.0f
+    }
+}
+
+void jit_logical_not_emitter::register_table_entries() {  // constants that emit_isa fetches through table_val()
+    push_arg_entry_of("zero", 0x00000000, true);  // bit pattern of 0.0f; NOTE(review): the `true` flag presumably requests a broadcast entry - confirm in jit_emitter
+    push_arg_entry_of("one", 0x3f800000, true);  // bit pattern of 1.0f
+}
+
+size_t jit_logical_not_emitter::aux_vecs_count() const {  // scratch vector registers requested by this emitter
+    return 2;  // emit_isa binds aux_vec_idxs[0] and [1]; only the sse4.2 path uses both
+}
+
+/// POWER_STATIC ///
+jit_power_static_emitter::jit_power_static_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+: jit_emitter(host, host_isa, node, exec_prc) {  // all arguments are forwarded unchanged to the base emitter
+    prepare_table();  // NOTE(review): presumably builds the constant table from register_table_entries() - confirm in jit_emitter
+}
+
+size_t jit_power_static_emitter::get_inputs_num() { return 1; }  // unary operation: power, scale and shift come from the layer, not from registers
+
+void jit_power_static_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    if (host_isa_ == cpu::sse42) {
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx2) {
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx512_common) {
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+    } else {
+        assert(!"unsupported isa");
+    }
+}
+
+template <mkldnn::impl::cpu::cpu_isa_t isa>  // emits dst = (scale * src0 + shift) ^ power; power/scale/shift are read from the PowerLayer while generating code
+void jit_power_static_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;  // Xmm on sse4.2, Ymm on avx2, Zmm otherwise
+    Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
+    Vmm vmm_dst = Vmm(out_vec_idxs[0]);
+    Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
+
+    auto *powerLayer = dynamic_cast<InferenceEngine::PowerLayer *>(n.getCnnLayer().get());
+    if (powerLayer == nullptr)
+        THROW_IE_EXCEPTION << "Cannot convert power layer.";  // the node's layer is not a PowerLayer
+
+    float power = powerLayer->power;
+    float scale = powerLayer->scale;
+    float shift = powerLayer->offset;
+
+    Xmm xmm0 = Xmm(0), xmm1 = Xmm(1);  // argument / result registers for the scalar powf calls below
+
+    if (scale != 1.f || shift != 0.f) {  // step 1: dst = scale * src0 + shift (skipped when it is the identity)
+        if (isa == cpu::sse42) {
+            h->uni_vmovups(vmm_aux0, table_val("scale"));
+            h->uni_vmulps(vmm_aux0, vmm_aux0, vmm_src0);  // src0 is consumed here, before dst is overwritten, so dst may alias src0
+            h->uni_vmovups(vmm_dst, table_val("shift"));
+            h->uni_vaddps(vmm_dst, vmm_dst, vmm_aux0);
+        } else {
+            if (vmm_dst.getIdx() != vmm_src0.getIdx()) {
+                h->uni_vmovups(vmm_dst, table_val("shift"));
+                h->uni_vfmadd231ps(vmm_dst, vmm_src0, table_val("scale"));  // dst = src0 * scale + dst
+            } else {  // dst aliases src0: accumulate in aux0 so the input is not destroyed by the shift load
+                h->uni_vmovups(vmm_aux0, table_val("shift"));
+                h->uni_vfmadd231ps(vmm_aux0, vmm_src0, table_val("scale"));
+                h->uni_vmovups(vmm_dst, vmm_aux0);
+            }
+        }
+    } else {
+        if (vmm_dst.getIdx() != vmm_src0.getIdx())
+            h->uni_vmovups(vmm_dst, vmm_src0);
+    }
+
+    if (power == 1.f) {  // step 2: raise dst to `power`; for power == 1 nothing more is emitted
+    } else if (power == 0.5f || power == -0.5f) {
+        h->uni_vsqrtps(vmm_dst, vmm_dst);
+
+        if (power < 0.f) {  // power == -0.5: dst = 1 / sqrt(x)
+            h->uni_vmovups(vmm_aux0, table_val("one"));
+            if (isa == cpu::sse42) {
+                h->uni_vdivps(vmm_aux0, vmm_aux0, vmm_dst);
+                h->uni_vmovups(vmm_dst, vmm_aux0);
+            } else {
+                h->uni_vdivps(vmm_dst, vmm_aux0, vmm_dst);
+            }
+        }
+    } else if (std::floor(power) == power && power != 0) {  // non-zero integral exponent: repeated multiplication, unrolled at generation time
+        int ipower = std::abs(static_cast<int>(power));
+        h->uni_vmovups(vmm_aux0, vmm_dst);  // aux0 keeps the base x while dst accumulates x^i
+        for (int i = 1; i < ipower; i++) {
+            h->uni_vmulps(vmm_dst, vmm_dst, vmm_aux0);
+        }
+
+        if (power < 0.f) {  // negative exponent: dst = 1 / x^|power|
+            h->uni_vmovups(vmm_aux0, table_val("one"));
+            if (isa == cpu::sse42) {
+                h->uni_vdivps(vmm_aux0, vmm_aux0, vmm_dst);
+                h->uni_vmovups(vmm_dst, vmm_aux0);
+            } else {
+                h->uni_vdivps(vmm_dst, vmm_aux0, vmm_dst);
+            }
+        }
+    } else {  // every other exponent (including 0): call powf once per lane
+        h->uni_vmovups(vmm_aux0, table_val("power"));
+
+        // caller obligation to save gprs as callee may use them
+        size_t gpr_size = 8;
+        Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->rax,
+                                         h->rcx, h->rdx, h->rdi, h->rsi, h->rbp, h->rbx};  // rbp and rbx are included because they are repurposed below
+        size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);
+
+        h->sub(h->rsp, n_gprs_to_save * gpr_size);
+        for (size_t i = 0; i < n_gprs_to_save; ++i)
+            h->mov(h->ptr[h->rsp + i * gpr_size], gprs_to_save[i]);
+
+        // caller obligation to save k-regs as callee may use them
+        size_t n_k_regs_to_save = 8;
+        if (isa == cpu::avx512_common || isa == cpu::avx512_core) {
+            h->sub(h->rsp, n_k_regs_to_save * k_mask_size);
+            for (size_t i = 0; i < n_k_regs_to_save; ++i) {
+                if (mayiuse(avx512_core))  // 64-bit mask moves are emitted only when avx512_core is available, 16-bit moves otherwise
+                    h->kmovq(h->ptr[h->rsp + i * k_mask_size], Opmask(i));
+                else
+                    h->kmovw(h->ptr[h->rsp + i * k_mask_size], Opmask(i));
+            }
+        }
+
+        // 1. Caller obligation to save vector registers as callee may use them.
+        // 2. Additionally reserve two slots: slot 0 for the base vector (vmm_dst), which is
+        // overwritten in place with the result, and slot 1 for the exponent vector (vmm_aux0).
+        // 3. There is an implicit assumption that the host code uses the same
+        // `isa` as the injector. Once the assumption is wrong, `vecs_count` and
+        // `vlen` should be replaced with `host_isa::vlen` and
+        // `host_isa::vecs_count`.
+        h->sub(h->rsp, (get_max_vecs_count() + 2) * get_vec_length());
+        for (size_t i = 2; i < get_max_vecs_count() + 2; ++i)
+            h->uni_vmovups(h->ptr[h->rsp + i * get_vec_length()], Vmm(i - 2));  // slot i holds vector register i - 2
+        h->uni_vmovups(h->ptr[h->rsp + 0 * get_vec_length()], vmm_dst); // base values (input of powf)
+        h->uni_vmovups(h->ptr[h->rsp + 1 * get_vec_length()], vmm_aux0); // exponent values
+
+        // load the powf address into a gpr for the indirect call below
+        h->mov(h->rbp, reinterpret_cast<uintptr_t>(powf));
+
+        // align stack on 16-byte as ABI requires
+        h->mov(h->rbx, h->rsp);
+        h->and_(h->rbx, 0xf);  // rbx = misalignment of rsp
+        h->sub(h->rsp, h->rbx);  // rsp + rbx still addresses slot 0
+
+        // Take src, apply powf on it and replace value on a stack with dst.
+        for (size_t i = 0; i < get_vec_length() / sizeof(float); ++i) {  // unrolled at generation time: one call per float lane
+            const Address &source = h->ptr[h->rsp + h->rbx + i * sizeof(float)];
+            h->uni_vmovss(xmm0, source);  // first powf argument: base lane i
+            h->uni_vmovss(xmm1, h->ptr[h->rsp + h->rbx + get_vec_length() + i * sizeof(float)]);  // second powf argument: exponent lane i
+            h->call(h->rbp);
+            h->uni_vmovss(source, xmm0);  // result replaces the base lane in slot 0
+        }
+
+        h->add(h->rsp, h->rbx);  // undo the alignment adjustment
+
+        // restore vector registers
+        for (size_t i = get_max_vecs_count() + 1; i >= 2; --i)  // stops at i == 1, so the unsigned counter never wraps
+            h->uni_vmovups(Vmm(i - 2), h->ptr[h->rsp + i * get_vec_length()]);
+        h->uni_vmovups(vmm_dst, h->ptr[h->rsp + 0 * get_vec_length()]);  // loaded last so the restore above cannot overwrite the result
+        h->add(h->rsp, (get_max_vecs_count() + 2) * get_vec_length());
+
+        // restore k registers
+        if (isa == cpu::avx512_common || isa == cpu::avx512_core) {
+            for (int i = n_k_regs_to_save - 1; i >= 0; --i) {
+                if (mayiuse(avx512_core))
+                    h->kmovq(Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
+                else
+                    h->kmovw(Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
+            }
+            h->add(h->rsp, n_k_regs_to_save * k_mask_size);
+        }
+
+        // restore gpr registers
+        for (int i = n_gprs_to_save - 1; i >= 0; --i)
+            h->mov(gprs_to_save[i], h->ptr[h->rsp + i * gpr_size]);
+        h->add(h->rsp, n_gprs_to_save * gpr_size);
+    }
+}
+
+void jit_power_static_emitter::register_table_entries() {  // constants that emit_isa fetches through table_val()
+    auto *powerLayer = dynamic_cast<InferenceEngine::PowerLayer *>(n.getCnnLayer().get());
+    if (powerLayer == nullptr)
+        THROW_IE_EXCEPTION << "Cannot convert power layer.";  // the node's layer is not a PowerLayer
+
+    float power_ = powerLayer->power;
+    float scale_ = powerLayer->scale;
+    float shift_ = powerLayer->offset;
+
+    push_arg_entry_of("power", float2int(power_), true);  // NOTE(review): float2int presumably reinterprets the float's bits as an integer - confirm
+    push_arg_entry_of("scale", float2int(scale_), true);
+    push_arg_entry_of("shift", float2int(shift_), true);
+    push_arg_entry_of("one",   float2int(1.f), true);  // numerator for the reciprocal used with negative exponents
+}
+
+size_t jit_power_static_emitter::aux_vecs_count() const {  // scratch vector registers requested by this emitter
+    return 1;  // emit_isa binds only aux_vec_idxs[0]
+}
+
+/// PRELU ///
+jit_prelu_emitter::jit_prelu_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+: jit_emitter(host, host_isa, node, exec_prc) {  // all arguments are forwarded unchanged to the base emitter
+    prepare_table();  // NOTE(review): this class declares no register_table_entries() override and emit_isa reads no table_val() - check whether the call is needed
+}
+
+size_t jit_prelu_emitter::get_inputs_num() { return 2; }  // in_vec_idxs[0] is the data, in_vec_idxs[1] the multiplier applied to negative lanes
+
+void jit_prelu_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    if (host_isa_ == cpu::sse42) {
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx2) {
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx512_common) {
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+    } else {
+        assert(!"unsupported isa");
+    }
+}
+
+template <mkldnn::impl::cpu::cpu_isa_t isa>  // emits PReLU: negative lanes of src0 become src0 * src1, the other lanes pass src0 through
+void jit_prelu_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;  // Xmm on sse4.2, Ymm on avx2, Zmm otherwise
+    Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
+    Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
+    Vmm vmm_dst = Vmm(out_vec_idxs[0]);
+    Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);  // NOTE(review): SSE4.1 blendvps takes its mask implicitly from xmm0, so this presumably has to be register 0 on sse4.2 - confirm in jit_emitter
+    Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
+
+    if (isa == cpu::sse42) {
+        h->pxor(vmm_aux0, vmm_aux0);
+        h->cmpps(vmm_aux0, vmm_src0, _cmp_gt_os);  // aux0 = lane mask of (0 > src0)
+        h->movups(vmm_aux1, vmm_src1);
+        h->mulps(vmm_aux1, vmm_src0);  // aux1 = src1 * src0
+        if (vmm_src0.getIdx() != vmm_dst.getIdx())
+            h->movups(vmm_dst, vmm_src0);
+        h->blendvps(vmm_dst, vmm_aux1);  // negative lanes take the product, the rest keep src0
+    } else if (isa == cpu::avx2) {
+        h->vmulps(vmm_aux0, vmm_src0, vmm_src1);
+        h->vxorps(vmm_aux1, vmm_aux1, vmm_aux1);
+        h->vcmpgtps(vmm_aux1, vmm_src0, vmm_aux1);  // aux1 = lane mask of (src0 > 0)
+        h->vblendvps(vmm_dst, vmm_aux0, vmm_src0, vmm_aux1);  // dst = (src0 > 0) ? src0 : src0 * src1; zero lanes take the product here
+    } else if (isa == cpu::avx512_common) {
+        h->vxorpd(vmm_aux0, vmm_aux0, vmm_aux0);
+        if (vmm_src0.getIdx() != vmm_dst.getIdx())
+            h->vmovups(vmm_dst, vmm_src0);
+        h->vcmpps(k_mask, vmm_src0, vmm_aux0, _cmp_lt_os);  // k_mask = lanes where src0 < 0
+        h->vmulps(vmm_dst | k_mask, vmm_src0, vmm_src1);  // merge-masked: only negative lanes are replaced by the product
+    }
+}
+
+size_t jit_prelu_emitter::aux_vecs_count() const {  // scratch vector registers requested by this emitter
+    return 2;  // emit_isa binds aux_vec_idxs[0] and [1]
+}
+
+} // namespace MKLDNNPlugin
diff --git a/inference-engine/src/mkldnn_plugin/nodes/jit_eltwise_emitters.hpp b/inference-engine/src/mkldnn_plugin/nodes/jit_eltwise_emitters.hpp
new file mode 100644 (file)
index 0000000..baa3fd8
--- /dev/null
@@ -0,0 +1,417 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "common/emitter.h"
+#include "jit_generator.hpp"
+#include "mkldnn_node.h"
+
+namespace MKLDNNPlugin {
+
+class jit_add_emitter : public jit_emitter {  // jit_emitter subclass; by its name the element-wise add emitter (definition is not in this header)
+public:
+    jit_add_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                    InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);  // execution precision defaults to FP32
+
+    size_t get_inputs_num() override;  // number of inputs the operation consumes
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                  const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;  // code-generation hook overridden from jit_emitter
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>  // per-ISA code generator
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+};
+
+class jit_mul_add_emitter : public jit_emitter {  // jit_emitter subclass; by its name a fused multiply-add emitter (definition is not in this header)
+public:
+    jit_mul_add_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                        InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);  // execution precision defaults to FP32
+
+    size_t get_inputs_num() override;  // number of inputs the operation consumes
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                  const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;  // code-generation hook overridden from jit_emitter
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>  // per-ISA code generator
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    size_t aux_vecs_count() const override;  // number of scratch vector registers requested
+};
+
+
+class jit_subtract_emitter : public jit_emitter {  // jit_emitter subclass; by its name the element-wise subtract emitter (definition is not in this header)
+public:
+    jit_subtract_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                         InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);  // execution precision defaults to FP32
+
+    size_t get_inputs_num() override;  // number of inputs the operation consumes
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                  const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;  // code-generation hook overridden from jit_emitter
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>  // per-ISA code generator
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+};
+
+
+class jit_multiply_emitter : public jit_emitter {  // jit_emitter subclass; by its name the element-wise multiply emitter (definition is not in this header)
+public:
+    jit_multiply_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                         InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);  // execution precision defaults to FP32
+
+    size_t get_inputs_num() override;  // number of inputs the operation consumes
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                  const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;  // code-generation hook overridden from jit_emitter
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>  // per-ISA code generator
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+};
+
+
+class jit_divide_emitter : public jit_emitter {  // jit_emitter subclass; by its name the element-wise divide emitter (definition is not in this header)
+public:
+    jit_divide_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                       InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);  // execution precision defaults to FP32
+
+    size_t get_inputs_num() override;  // number of inputs the operation consumes
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                  const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;  // code-generation hook overridden from jit_emitter
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>  // per-ISA code generator
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+};
+
+
+class jit_floor_mod_emitter : public jit_emitter {  // jit_emitter subclass; by its name the floor-modulo emitter (definition is not in this header)
+public:
+    jit_floor_mod_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                          InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);  // execution precision defaults to FP32
+
+    size_t get_inputs_num() override;  // number of inputs the operation consumes
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                  const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;  // code-generation hook overridden from jit_emitter
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>  // per-ISA code generator
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+    size_t aux_vecs_count() const override;  // number of scratch vector registers requested
+};
+
+
+class jit_mod_emitter : public jit_emitter {  // jit_emitter subclass; by its name the modulo emitter (definition is not in this header)
+public:
+    jit_mod_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                    InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);  // execution precision defaults to FP32
+
+    size_t get_inputs_num() override;  // number of inputs the operation consumes
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                  const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;  // code-generation hook overridden from jit_emitter
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>  // per-ISA code generator
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+    size_t aux_vecs_count() const override;  // number of scratch vector registers requested
+};
+
+
+class jit_maximum_emitter : public jit_emitter {  // jit_emitter subclass; by its name the element-wise maximum emitter (definition is not in this header)
+public:
+    jit_maximum_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                        InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);  // execution precision defaults to FP32
+
+    size_t get_inputs_num() override;  // number of inputs the operation consumes
+    static std::set<InferenceEngine::Precision> get_supported_precisions();  // static query, usable without an emitter instance; presumably the accepted execution precisions
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                  const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;  // code-generation hook overridden from jit_emitter
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>  // per-ISA code generator
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+};
+
+
+class jit_minimum_emitter : public jit_emitter {  // jit_emitter subclass; by its name the element-wise minimum emitter (definition is not in this header)
+public:
+    jit_minimum_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                        InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);  // execution precision defaults to FP32
+
+    size_t get_inputs_num() override;  // number of inputs the operation consumes
+    static std::set<InferenceEngine::Precision> get_supported_precisions();  // static query, usable without an emitter instance; presumably the accepted execution precisions
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                  const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;  // code-generation hook overridden from jit_emitter
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>  // per-ISA code generator
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+};
+
+
+class jit_squared_difference_emitter : public jit_emitter {  // jit_emitter subclass; by its name the squared-difference emitter (definition is not in this header)
+public:
+    jit_squared_difference_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                                   InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);  // execution precision defaults to FP32
+
+    size_t get_inputs_num() override;  // number of inputs the operation consumes
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                  const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;  // code-generation hook overridden from jit_emitter
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>  // per-ISA code generator
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+};
+
+
+class jit_power_dynamic_emitter : public jit_emitter {  // jit_emitter subclass; by its name a power emitter whose exponent is a runtime input (definition is not in this header)
+public:
+    jit_power_dynamic_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                              InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);  // execution precision defaults to FP32
+
+    size_t get_inputs_num() override;  // number of inputs the operation consumes
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                  const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;  // code-generation hook overridden from jit_emitter
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>  // per-ISA code generator
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+};
+
+
+class jit_equal_emitter : public jit_emitter {  // jit_emitter subclass; by its name the element-wise == comparison emitter (definition is not in this header)
+public:
+    jit_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                      InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);  // execution precision defaults to FP32
+
+    size_t get_inputs_num() override;  // number of inputs the operation consumes
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                  const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;  // code-generation hook overridden from jit_emitter
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>  // per-ISA code generator
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    void register_table_entries() override;  // registers the constants the generated code reads from the emitter table
+    size_t aux_vecs_count() const override;  // number of scratch vector registers requested
+};
+
+
+class jit_not_equal_emitter : public jit_emitter {  // jit_emitter subclass; by its name the element-wise != comparison emitter (definition is not in this header)
+public:
+    jit_not_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                          InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);  // execution precision defaults to FP32
+
+    size_t get_inputs_num() override;  // number of inputs the operation consumes
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                  const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;  // code-generation hook overridden from jit_emitter
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>  // per-ISA code generator
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    void register_table_entries() override;  // registers the constants the generated code reads from the emitter table
+    size_t aux_vecs_count() const override;  // number of scratch vector registers requested
+};
+
+
+class jit_greater_emitter : public jit_emitter {  // jit_emitter subclass; by its name the element-wise > comparison emitter (definition is not in this header)
+public:
+    jit_greater_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                        InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);  // execution precision defaults to FP32
+
+    size_t get_inputs_num() override;  // number of inputs the operation consumes
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                  const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;  // code-generation hook overridden from jit_emitter
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>  // per-ISA code generator
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    void register_table_entries() override;  // registers the constants the generated code reads from the emitter table
+    size_t aux_vecs_count() const override;  // number of scratch vector registers requested
+};
+
+
+class jit_greater_equal_emitter : public jit_emitter {  // jit_emitter subclass; by its name the element-wise >= comparison emitter (definition is not in this header)
+public:
+    jit_greater_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                              InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);  // execution precision defaults to FP32
+
+    size_t get_inputs_num() override;  // number of inputs the operation consumes
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                  const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;  // code-generation hook overridden from jit_emitter
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>  // per-ISA code generator
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    void register_table_entries() override;  // registers the constants the generated code reads from the emitter table
+    size_t aux_vecs_count() const override;  // number of scratch vector registers requested
+};
+
+
+class jit_less_emitter : public jit_emitter {  // jit_emitter subclass; by its name the element-wise < comparison emitter (definition is not in this header)
+public:
+    jit_less_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                     InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);  // execution precision defaults to FP32
+
+    size_t get_inputs_num() override;  // number of inputs the operation consumes
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                  const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;  // code-generation hook overridden from jit_emitter
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>  // per-ISA code generator
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    void register_table_entries() override;  // registers the constants the generated code reads from the emitter table
+    size_t aux_vecs_count() const override;  // number of scratch vector registers requested
+};
+
+
+class jit_less_equal_emitter : public jit_emitter {  // jit_emitter subclass; by its name the element-wise <= comparison emitter (definition is not in this header)
+public:
+    jit_less_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                           InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);  // execution precision defaults to FP32
+
+    size_t get_inputs_num() override;  // number of inputs the operation consumes
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                  const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;  // code-generation hook overridden from jit_emitter
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>  // per-ISA code generator
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    void register_table_entries() override;  // registers the constants the generated code reads from the emitter table
+    size_t aux_vecs_count() const override;  // number of scratch vector registers requested
+};
+
+
+class jit_logical_and_emitter : public jit_emitter {  // jit_emitter subclass; by its name the element-wise logical AND emitter (definition is not in this header)
+public:
+    jit_logical_and_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);  // execution precision defaults to FP32
+
+    size_t get_inputs_num() override;  // number of inputs the operation consumes
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                  const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;  // code-generation hook overridden from jit_emitter
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>  // per-ISA code generator
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    void register_table_entries() override;  // registers the constants the generated code reads from the emitter table
+    size_t aux_vecs_count() const override;  // number of scratch vector registers requested
+};
+
+
+class jit_logical_or_emitter : public jit_emitter {  // emits element-wise logical OR over float lanes: result is 1.0f if either input is non-zero, else 0.0f
+public:
+    jit_logical_or_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                           InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);  // execution precision defaults to FP32
+
+    size_t get_inputs_num() override;  // two inputs
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                  const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;  // dispatches to emit_isa<> for the host ISA
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>  // per-ISA code generator (sse4.2 / avx2 / avx512)
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    void register_table_entries() override;  // registers the "zero" and "one" constants
+    size_t aux_vecs_count() const override;  // three scratch vector registers
+};
+
+
+class jit_logical_xor_emitter : public jit_emitter {  // emits element-wise logical XOR over float lanes: result is 1.0f if exactly one input is non-zero, else 0.0f
+public:
+    jit_logical_xor_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);  // execution precision defaults to FP32
+
+    size_t get_inputs_num() override;  // two inputs
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                  const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;  // dispatches to emit_isa<> for the host ISA
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>  // per-ISA code generator (sse4.2 / avx2 / avx512)
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    void register_table_entries() override;  // registers the "zero" and "one" constants
+    size_t aux_vecs_count() const override;  // three scratch vector registers
+};
+
+class jit_logical_not_emitter : public jit_emitter {  // emits element-wise logical NOT over float lanes: result is 1.0f where the input equals 0, else 0.0f
+public:
+    jit_logical_not_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);  // execution precision defaults to FP32
+
+    size_t get_inputs_num() override;  // one input
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                  const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;  // dispatches to emit_isa<> for the host ISA
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>  // per-ISA code generator (sse4.2 / avx2 / avx512)
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    void register_table_entries() override;  // registers the "zero" and "one" constants
+    size_t aux_vecs_count() const override;  // two scratch vector registers
+};
+
+class jit_power_static_emitter : public jit_emitter {  // emits (scale * x + shift) ^ power with power/scale/shift taken from the node's PowerLayer at generation time
+public:
+    jit_power_static_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                             InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);  // execution precision defaults to FP32
+
+    size_t get_inputs_num() override;  // one input
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                  const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;  // dispatches to emit_isa<> for the host ISA
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>  // per-ISA code generator; falls back to per-lane powf calls for non-trivial exponents
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    void register_table_entries() override;  // registers the "power", "scale", "shift" and "one" constants
+    size_t aux_vecs_count() const override;  // one scratch vector register
+};
+
+class jit_prelu_emitter : public jit_emitter {  // emits PReLU: negative lanes of input 0 are multiplied by input 1, other lanes pass through
+public:
+    jit_prelu_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                      InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);  // execution precision defaults to FP32
+
+    size_t get_inputs_num() override;  // two inputs: data and per-lane multiplier
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                  const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;  // dispatches to emit_isa<> for the host ISA
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>  // per-ISA code generator (sse4.2 / avx2 / avx512)
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    size_t aux_vecs_count() const override;  // two scratch vector registers
+};
+
+} // namespace MKLDNNPlugin
diff --git a/inference-engine/src/mkldnn_plugin/nodes/jit_mkldnn_emitters.cpp b/inference-engine/src/mkldnn_plugin/nodes/jit_mkldnn_emitters.cpp
new file mode 100644 (file)
index 0000000..9be8fd9
--- /dev/null
@@ -0,0 +1,70 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "common/emitter.h"
+#include "jit_mkldnn_emitters.hpp"
+#include "mkldnn_eltwise_node.h"
+#include "legacy/ie_layers.h"
+
+using namespace mkldnn::impl::utils;
+using namespace mkldnn::impl::cpu;
+using namespace Xbyak;
+
+namespace MKLDNNPlugin {
+
+// Creates the mkldnn eltwise injector that matches the host ISA. The algorithm kind and the
+// alpha/beta coefficients are read from the eltwise node; the injectors for the other ISAs
+// are left empty.
+jit_mkldnn_emitter::jit_mkldnn_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, InferenceEngine::Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+    // A reference dynamic_cast throws std::bad_cast if the node is not an MKLDNNEltwiseNode.
+    const auto& eltwise = dynamic_cast<const MKLDNNEltwiseNode&>(n);
+    const auto alg_kind = static_cast<mkldnn_alg_kind_t>(eltwise.getAlgorithm());
+
+    if (host_isa_ == cpu::sse42) {
+        using injector_t = jit_uni_eltwise_injector_f32<cpu::sse42>;
+        eltwise_injector_sse42 = std::make_shared<injector_t>(host, alg_kind, eltwise.getAlpha(), eltwise.getBeta());
+        return;
+    }
+
+    if (host_isa_ == cpu::avx2) {
+        using injector_t = jit_uni_eltwise_injector_f32<cpu::avx2>;
+        eltwise_injector_avx2 = std::make_shared<injector_t>(host, alg_kind, eltwise.getAlpha(), eltwise.getBeta());
+        return;
+    }
+
+    if (host_isa_ == cpu::avx512_common) {
+        using injector_t = jit_uni_eltwise_injector_f32<cpu::avx512_common>;
+        eltwise_injector_avx512_common = std::make_shared<injector_t>(host, alg_kind, eltwise.getAlpha(), eltwise.getBeta());
+        return;
+    }
+
+    // None of the supported instruction sets matched.
+    assert(!"unsupported isa");
+}
+
+// The injector-based emitter consumes a single input.
+size_t jit_mkldnn_emitter::get_inputs_num() {
+    return 1;
+}
+
+// Generates the eltwise computation for one vector register using the injector of the host ISA.
+// Only element 0 of in_vec_idxs / out_vec_idxs is read; pool_vec_idxs and pool_gpr_idxs are
+// not used by this emitter.
+void jit_mkldnn_emitter::emit(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                              const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    if (host_isa_ == cpu::sse42) {
+        const size_t dst = out_vec_idxs[0];
+        const size_t src = in_vec_idxs[0];
+        // compute_vector() is given only the output index, so move the input there first.
+        if (dst != src)
+            h->uni_vmovups(Xmm(dst), Xmm(src));
+        eltwise_injector_sse42->compute_vector(dst);
+        return;
+    }
+
+    if (host_isa_ == cpu::avx2) {
+        const size_t dst = out_vec_idxs[0];
+        const size_t src = in_vec_idxs[0];
+        if (dst != src)
+            h->uni_vmovups(Ymm(dst), Ymm(src));
+        eltwise_injector_avx2->compute_vector(dst);
+        return;
+    }
+
+    if (host_isa_ == cpu::avx512_common) {
+        const size_t dst = out_vec_idxs[0];
+        const size_t src = in_vec_idxs[0];
+        if (dst != src)
+            h->uni_vmovups(Zmm(dst), Zmm(src));
+        eltwise_injector_avx512_common->compute_vector(dst);
+        return;
+    }
+
+    // None of the supported instruction sets matched.
+    assert(!"unsupported isa");
+}
+
+// Asks the injector created for the host ISA to prepare its table.
+void jit_mkldnn_emitter::emit_table() {
+    if (host_isa_ == cpu::sse42) {
+        eltwise_injector_sse42->prepare_table();
+        return;
+    }
+
+    if (host_isa_ == cpu::avx2) {
+        eltwise_injector_avx2->prepare_table();
+        return;
+    }
+
+    if (host_isa_ == cpu::avx512_common) {
+        eltwise_injector_avx512_common->prepare_table();
+        return;
+    }
+
+    // None of the supported instruction sets matched.
+    assert(!"unsupported isa");
+}
+
+
+} // namespace MKLDNNPlugin
diff --git a/inference-engine/src/mkldnn_plugin/nodes/jit_mkldnn_emitters.hpp b/inference-engine/src/mkldnn_plugin/nodes/jit_mkldnn_emitters.hpp
new file mode 100644 (file)
index 0000000..cfd4039
--- /dev/null
@@ -0,0 +1,32 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "common/emitter.h"
+#include "jit_generator.hpp"
+#include "mkldnn_node.h"
+#include "jit_uni_eltwise.hpp"
+
+namespace MKLDNNPlugin {
+
+// Emitter that delegates the computation to mkldnn's jit_uni_eltwise_injector_f32.
+// One injector pointer is kept per supported ISA (sse42, avx2, avx512_common); the constructor
+// creates only the one that matches the host ISA, the other two stay empty.
+class jit_mkldnn_emitter : public jit_emitter {
+public:
+    // host     - JIT generator handed to the injector
+    // host_isa - instruction set used to pick the injector specialization
+    // node     - graph node; it is cast to MKLDNNEltwiseNode to read the algorithm, alpha and beta
+    // exec_prc - execution precision; FP32 when omitted
+    jit_mkldnn_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                       InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    // Returns 1: the emitter consumes a single input.
+    size_t get_inputs_num() override;
+
+    // Moves the input vector into the output vector when their indices differ, then runs the
+    // injector's compute_vector() on the output index. Only element 0 of in_vec_idxs and
+    // out_vec_idxs is read; the pool arguments are not used.
+    void emit(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+              const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
+
+    // Forwards to the active injector's prepare_table().
+    void emit_table() override;
+
+private:
+    // Per-ISA injectors; at most one of them is non-null after construction.
+    std::shared_ptr<mkldnn::impl::cpu::jit_uni_eltwise_injector_f32<mkldnn::impl::cpu::sse42>> eltwise_injector_sse42;
+    std::shared_ptr<mkldnn::impl::cpu::jit_uni_eltwise_injector_f32<mkldnn::impl::cpu::avx2>> eltwise_injector_avx2;
+    std::shared_ptr<mkldnn::impl::cpu::jit_uni_eltwise_injector_f32<mkldnn::impl::cpu::avx512_common>> eltwise_injector_avx512_common;
+};
+
+} // namespace MKLDNNPlugin
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.cpp
deleted file mode 100644 (file)
index 144f8d9..0000000
+++ /dev/null
@@ -1,252 +0,0 @@
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#include "mkldnn_activation_node.h"
-#include "desc_iterator.hpp"
-#include <legacy/ie_layers.h>
-#include <algorithm>
-#include <string>
-#include <mkldnn_extension_utils.h>
-
-using namespace mkldnn;
-using namespace MKLDNNPlugin;
-using namespace InferenceEngine;
-using namespace InferenceEngine::details;
-
-// TODO: (ichuraev) I don't fully sure that names of types and parameters are correct for square, abs, sqrt, linear, bounded_relu and soft_relu
-caseless_map<std::string, std::function<void(GenericLayer*, mkldnn::algorithm&, float&, float&)>> MKLDNNActivationNode::initializers = {
-        {"relu", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
-            alpha = activationLayer->GetParamAsFloat("negative_slope", 0.0f);
-            beta = 0.0f;
-            algorithm = eltwise_relu;
-        }},
-        {"gelu", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
-            alpha = 0.0f;
-            beta = 0.0f;
-            algorithm = eltwise_gelu;
-        }},
-        {"elu", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
-            alpha = activationLayer->GetParamAsFloat("alpha", 1.0f);
-            beta = 0.0f;
-            algorithm = eltwise_elu;
-        }},
-        {"tanh", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
-            alpha = 0.0f;
-            beta = 0.0f;
-            algorithm = eltwise_tanh;
-        }},
-        {"logistic", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
-            alpha = 0.0f;
-            beta = 0.0f;
-            algorithm = eltwise_logistic;
-        }},
-        {"square", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
-            alpha = 0.0f;
-            beta = 0.0f;
-            algorithm = eltwise_square;
-        }},
-        {"abs", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
-            alpha = 0.0f;
-            beta = 0.0f;
-            algorithm = eltwise_abs;
-        }},
-        {"sqrt", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
-            alpha = 0.0f;
-            beta = 0.0f;
-            algorithm = eltwise_sqrt;
-        }},
-        {"linear", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
-            alpha = activationLayer->GetParamAsFloat("alpha", 1.0f);
-            beta = activationLayer->GetParamAsFloat("beta", 0.0f);
-            algorithm = eltwise_linear;
-        }},
-        {"bounded_relu", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
-            alpha = activationLayer->GetParamAsFloat("alpha", 0.0f);
-            beta = 0.0f;
-            algorithm = eltwise_bounded_relu;
-        }},
-        {"soft_relu", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
-            alpha = 0.0f;
-            beta = 0.0f;
-            algorithm = eltwise_soft_relu;
-        }},
-        {"relu6", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
-            alpha = activationLayer->GetParamAsFloat("n", 6.0f);
-            beta = 0.0f;
-            algorithm = eltwise_bounded_relu;
-        }},
-        {"clamp", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
-            alpha = activationLayer->GetParamAsFloat("max", 1.0f);
-            beta = activationLayer->GetParamAsFloat("min", 0.0f);
-            algorithm = eltwise_clamp;
-        }},
-        {"exp", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
-            alpha = 0.0f;
-            beta = 0.0f;
-            algorithm = eltwise_exp;
-        }},
-        {"not", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
-            alpha = 0.0f;
-            beta = 0.0f;
-            algorithm = eltwise_not;
-        }},
-        {"swish", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
-            alpha = activationLayer->GetParamAsFloat("alpha", 1.0f);
-            beta = 0.0f;
-            algorithm = eltwise_swish;
-        }},
-        {"hswish", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
-            alpha = 0.0f;
-            beta = 0.0f;
-            algorithm = eltwise_hswish;
-        }},
-        {"mish", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
-            alpha = 0.0f;
-            beta = 0.0f;
-            algorithm = eltwise_mish;
-        }},
-        {"hsigmoid", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
-            alpha = 0.0f;
-            beta = 0.0f;
-            algorithm = eltwise_hsigmoid;
-        }},
-};
-
-MKLDNNActivationNode::MKLDNNActivationNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng,
-        MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(layer, eng, cache) {
-    GenericLayer* activationLayer = getCnnLayer().get();
-    if (activationLayer == nullptr)
-        THROW_IE_EXCEPTION << "Cannot get CNNLayer.";
-
-    std::string type = activationLayer->type;
-    CaselessEq<std::string> comparator;
-    if (comparator(type, "activation"))
-        type = activationLayer->GetParamAsString("type");
-    if (comparator(type, "sigmoid"))
-        type = "logistic";
-
-    if (initializers.find(type) != initializers.end())
-        initializers[type](activationLayer, algorithm, alpha, beta);
-}
-
-void MKLDNNActivationNode::getSupportedDescriptors() {
-    if (!descs.empty())
-        return;
-
-    if (getParentEdges().size() != 1)
-        THROW_IE_EXCEPTION << "Incorrect number of input edges for layer " << getName();
-    if (!getChildEdges().size())
-        THROW_IE_EXCEPTION << "Incorrect number of output edges for layer " << getName();
-
-    auto parentOutDims = getParentEdgeAt(0)->getDims();
-
-    InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision();
-
-    // FIXME: MKLDNN doesn't support not inputs with number of dimensions less than 4 for activation
-    while (parentOutDims.ndims() < 4)
-        parentOutDims.push_back(1);
-    for (auto format : getAvailableFormatsForDims(parentOutDims)) {
-        MKLDNNMemoryDesc in_candidate(parentOutDims, MKLDNNExtensionUtils::IEPrecisionToDataType(precision), format);
-        createDescriptor({in_candidate}, {});
-    }
-}
-
-void MKLDNNActivationNode::createPrimitive() {
-    if (prim)
-        return;
-
-    auto prim_desc = createPrimitiveDescriptor<eltwise_forward::primitive_desc, eltwise_forward::desc>();
-
-    prim.reset(new eltwise_forward(prim_desc, getParentEdgeAt(0)->getMemory().GetPrimitive(),
-                                getChildEdgeAt(0)->getMemory().GetPrimitive()));
-}
-
-bool MKLDNNActivationNode::created() const {
-    return getType() == Activation;
-}
-
-void MKLDNNActivationNode::createDescriptor(const std::vector<InferenceEngine::TensorDesc> &inputDesc,
-                                            const std::vector<InferenceEngine::TensorDesc> &outputDesc) {
-    MKLDNNMemoryDesc inDesc(inputDesc[0]);
-    MKLDNNDescriptor desc(std::shared_ptr<eltwise_forward::desc>(
-            new eltwise_forward::desc(prop_kind::forward_scoring, getAlgorithm(), inDesc, getAlpha(), getBeta())));
-    descs.push_back(desc);
-}
-
-void MKLDNNActivationNode::initOptimalPrimitiveDescriptor() {
-    auto config = getSelectedPrimitiveDescriptor()->getConfig();
-    if (isInitConfig(config))
-        return;
-
-    if (config.inConfs.size() != 1 || config.outConfs.size() != 1 ||
-            (!isUninitTensorDesc(config.inConfs[0].desc) &&
-                    !isUninitTensorDesc(config.outConfs[0].desc) && config.inConfs[0].desc != config.outConfs[0].desc))
-        THROW_IE_EXCEPTION << "Layer " << getName() << " has incorrect selected config!";
-
-    if (!isUninitTensorDesc(config.inConfs[0].desc)) {
-        config.outConfs[0].desc = config.inConfs[0].desc;
-    } else if (!isUninitTensorDesc(config.outConfs[0].desc)) {
-        config.inConfs[0].desc = config.outConfs[0].desc;
-    } else {
-        config.outConfs[0].desc = config.inConfs[0].desc = getConfiguredInputDesc(config, 0);
-    }
-
-    initDescriptor(config);
-}
-
-MKLDNNMemoryDesc MKLDNNActivationNode::getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) {
-    InferenceEngine::TensorDesc desc = MKLDNNMemoryDesc(primitive_desc_it.src_primitive_desc(idx).desc());
-
-    auto parentOutDims = getParentEdgeAt(idx)->getDims().ToSizeVector();
-
-    SizeVector blocked_dims, order, dimOffsets, strides;
-    size_t offset = desc.getBlockingDesc().getOffsetPadding();
-
-    for (size_t i = 0; i < desc.getBlockingDesc().getStrides().size(); i++) {
-        if (desc.getBlockingDesc().getOrder()[i] >= parentOutDims.size())
-            continue;
-
-        blocked_dims.push_back(desc.getBlockingDesc().getBlockDims()[i]);
-        order.push_back(desc.getBlockingDesc().getOrder()[i]);
-        dimOffsets.push_back(desc.getBlockingDesc().getOffsetPaddingToData()[i]);
-        strides.push_back(desc.getBlockingDesc().getStrides()[i]);
-    }
-    if (desc.getLayout() == InferenceEngine::Layout::ANY)
-        return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
-                                                            parentOutDims,
-                                                            desc.getLayout()));
-    else
-        return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
-                                                            parentOutDims,
-                                                            {blocked_dims, order, offset, dimOffsets, strides}));
-}
-
-MKLDNNMemoryDesc MKLDNNActivationNode::getDstMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) {
-    InferenceEngine::TensorDesc desc = MKLDNNMemoryDesc(primitive_desc_it.dst_primitive_desc(idx).desc());
-
-    auto childInDims = getChildEdgeAt(idx)->getDims().ToSizeVector();
-
-    SizeVector blocked_dims, order, dimOffsets, strides;
-    size_t offset = desc.getBlockingDesc().getOffsetPadding();
-
-    for (size_t i = 0; i < desc.getBlockingDesc().getStrides().size(); i++) {
-        if (desc.getBlockingDesc().getOrder()[i] >= childInDims.size())
-            continue;
-
-        blocked_dims.push_back(desc.getBlockingDesc().getBlockDims()[i]);
-        order.push_back(desc.getBlockingDesc().getOrder()[i]);
-        dimOffsets.push_back(desc.getBlockingDesc().getOffsetPaddingToData()[i]);
-        strides.push_back(desc.getBlockingDesc().getStrides()[i]);
-    }
-    if (desc.getLayout() == InferenceEngine::Layout::ANY)
-        return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
-                                                            childInDims,
-                                                            desc.getLayout()));
-    else
-        return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
-                                                            childInDims,
-                                                            {blocked_dims, order, offset, dimOffsets, strides}));
-}
-
-REG_MKLDNN_PRIM_FOR(MKLDNNActivationNode, Activation);
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.h
deleted file mode 100644 (file)
index 997d4a8..0000000
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma once
-
-#include <ie_common.h>
-#include <mkldnn_node.h>
-#include "caseless.hpp"
-#include <string>
-#include <memory>
-#include <vector>
-
-namespace MKLDNNPlugin {
-
-class MKLDNNActivationNode : public MKLDNNNode {
-public:
-    MKLDNNActivationNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
-    ~MKLDNNActivationNode() override = default;
-
-    void getSupportedDescriptors() override;
-    void initOptimalPrimitiveDescriptor() override;
-    void createDescriptor(const std::vector<InferenceEngine::TensorDesc>& inputDesc,
-                          const std::vector<InferenceEngine::TensorDesc>& outputDesc) override;
-    void createPrimitive() override;
-    bool created() const override;
-
-    mkldnn::algorithm getAlgorithm() const { return algorithm; }
-    float getAlpha() const { return alpha; }
-    float getBeta() const { return beta; }
-
-    MKLDNNMemoryDesc getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
-    MKLDNNMemoryDesc getDstMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
-
-private:
-    float alpha = 0.0f;
-    float beta = 0.0f;
-    static InferenceEngine::details::caseless_map<std::string,
-            std::function<void(InferenceEngine::GenericLayer*, mkldnn::algorithm&, float&, float&)>> initializers;
-    mkldnn::algorithm algorithm = mkldnn::algorithm::eltwise_relu;
-};
-
-}  // namespace MKLDNNPlugin
-
index c632f82..09a49ba 100644 (file)
@@ -3,7 +3,6 @@
 //
 
 #include "mkldnn_batchnorm_node.h"
-#include "mkldnn_depthwise_node.h"
 #include <mkldnn_extension_utils.h>
 #include "common/cpu_memcpy.h"
 
index 9d46ef8..9237bdb 100644 (file)
@@ -25,7 +25,7 @@ public:
                           const std::vector<InferenceEngine::TensorDesc>& outputDesc) override;
     void createPrimitive() override;
     bool created() const override;
-    bool fusedWithScale() const {return fusedWith.size() == 1 && fusedWith[0]->getType() == Depthwise
+    bool fusedWithScale() const {return fusedWith.size() == 1 && fusedWith[0]->getType() == Eltwise
                                         && fusedWith[0]->getCnnLayer()->type == "ScaleShift";}
 
     MKLDNNMemoryDesc getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
index e92a898..d309be0 100644 (file)
@@ -5,10 +5,8 @@
 #include "mkldnn_bin_conv_node.h"
 #include "mkldnn_reorder_node.h"
 #include "mkldnn_input_node.h"
-#include "mkldnn_activation_node.h"
-#include "desc_iterator.hpp"
 #include "mkldnn_eltwise_node.h"
-#include "mkldnn_depthwise_node.h"
+#include "desc_iterator.hpp"
 #include "mkldnn_quantize_node.h"
 #include "mkldnn_conv_node.h"
 #include <legacy/ie_layers.h>
@@ -116,7 +114,6 @@ void MKLDNNBinaryConvolutionNode::getSupportedDescriptors() {
         paddingR[i] = (dst - calc_dst) * stride[i];
     }
 
-    withSum = isFusedWith(Eltwise);
     withDWConv = isFusedWith(Convolution);
     withBinarization = isFusedWith(Quantize);
     for (auto &node : fusedWith) {
@@ -138,12 +135,19 @@ void MKLDNNBinaryConvolutionNode::getSupportedDescriptors() {
 #endif
     }
 
-    int expectedInputEdgesNum = baseInputsNumber + isFusedWith(Eltwise);
+    withSum = false;
+    int expectedInputEdgesNum = baseInputsNumber;
     for (int i = 0; i < fusedWith.size(); i++) {
         auto *convolutionNode = dynamic_cast<MKLDNNConvolutionNode *>(fusedWith[i].get());
         if (convolutionNode) {
             expectedInputEdgesNum += convolutionNode->getBaseIntputsNumber() - 1;
         }
+
+        auto *eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
+        if (eltwiseNode && eltwiseNode->isSum()) {
+            withSum = true;
+            expectedInputEdgesNum++;
+        }
     }
 
     if (getParentEdges().size() != expectedInputEdgesNum)
@@ -164,88 +168,13 @@ void MKLDNNBinaryConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool
     for (auto &node : fusedWith) {
 #if defined (COMPILED_CPU_MKLDNN_ELTWISE_NODE)
         auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
-        if (eltwiseNode) {
-            if (eltwiseNode->getCnnLayer()->precision == Precision::I8) {
-                auto it = eltwiseNode->getCnnLayer()->blobs.find("eltwise-sum-scale");
-                if (it != eltwiseNode->getCnnLayer()->blobs.end()) {
-                    // currently there is the only one scale while we need scale by channel :(
-                    ops.append_sum(it->second->buffer().as<float*>()[0]);
-                }
-            } else {
-                ops.append_sum(1.0);
-            }
+        if (eltwiseNode && eltwiseNode->isSum()) {
+            ops.append_sum(1.0);
             continue;
         }
-#endif
-
-#if defined(COMPILED_CPU_MKLDNN_ACTIVATION_NODE)
-        auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get());
-        if (activationNode) {
-            ops.append_eltwise(1.0, activationNode->getAlgorithm(), activationNode->getAlpha(),
-                               activationNode->getBeta());
-            continue;
-        }
-#endif
-
-#if defined (COMPILED_CPU_MKLDNN_DEPTHWISE_NODE)
-        auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode *>(node.get());
-        if (depthwiseNode) {
-            auto* depthwiseLayer = reinterpret_cast<WeightableLayer*>(depthwiseNode->getCnnLayer().get());
-
-            if (initWeights) {
-                MKLDNNDims depthwiseDims({static_cast<ptrdiff_t>(rnd_up(biasesDims[0], 16))});
-
-                PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
-                PostOpsIntBlobMemory[blob_idx]->Create(depthwiseDims, memory::data_type::f32, memory::format::x);
-                PostOpsIntBlobMemory[blob_idx]->FillZero();
-
-                PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::x,
-                                                             depthwiseLayer->_weights->buffer(),
-                                                             depthwiseLayer->_weights->size() *
-                                                             MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
-
-                if (depthwiseNode->isBroadcast()) {
-                    float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[0];
-                    for (int i = 1; i < PostOpsIntBlobMemory[blob_idx]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
-                        static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[i] = broadcastValue;
-                    }
-                }
-
-                if (depthwiseNode->getAlgorithm() == depthwise_scale_shift) {
-                    PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
-                    PostOpsIntBlobMemory[blob_idx + 1]->Create(depthwiseDims, memory::data_type::f32,
-                                                                memory::format::x);
-                    PostOpsIntBlobMemory[blob_idx + 1]->FillZero();
-                    PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::x,
-                                                                 depthwiseLayer->_biases->buffer(),
-                                                                 depthwiseLayer->_biases->size() *
-                                                                 MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
-
-                    if (depthwiseNode->isBroadcast()) {
-                        float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[0];
-                        for (int i = 1; i < PostOpsIntBlobMemory[blob_idx + 1]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
-                            static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[i] = broadcastValue;
-                        }
-                    }
-
-                    ops.append_depthwise(depthwiseNode->getAlgorithm(),
-                                         (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(),
-                                         (const float *) PostOpsIntBlobMemory[blob_idx + 1]->GetData());
-
-                    blob_idx += 2;
-                } else {
-                    ops.append_depthwise(depthwiseNode->getAlgorithm(),
-                                         (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(),
-                                         nullptr);
-
-                    blob_idx += 1;
-                }
-            } else {
-                ops.append_depthwise(depthwiseNode->getAlgorithm(),
-                                     nullptr,
-                                     nullptr);
-            }
 
+        if (eltwiseNode) {
+            eltwiseNode->appendPostOps(ops);
             continue;
         }
 #endif
index c639fbf..be5fb61 100644 (file)
@@ -20,6 +20,7 @@
 #include "mkldnn_conv_node.h"
 #include "mkldnn_quantize_node.h"
 #include "mkldnn_pooling_node.h"
+#include "mkldnn_eltwise_node.h"
 #include <limits>
 #include "common/cpu_memcpy.h"
 
@@ -93,12 +94,9 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
     MKLDNNDims dstDims = getChildEdgeAt(0)->getDims();
     InferenceEngine::LayerConfig config;
     config.dynBatchSupport = true;
-    bool hasEltwise = false;
 
     for (size_t i = 0; i < getParentEdges().size(); i++) {
         auto parentEdge = getParentEdgeAt(i);
-        if (parentEdge->getParent()->getType() == Eltwise)
-            hasEltwise = true;
 
         InferenceEngine::DataConfig dataConfig;
         dataConfig.inPlace = -1;
@@ -117,7 +115,7 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
     config.outConfs.resize(1);
     config.outConfs[0].inPlace = -1;
     config.outConfs[0].constant = false;
-    if ((!isMixedPrecision && outputPrecision != Precision::U8 && outputPrecision != Precision::I8) || axis != 1 || hasEltwise) {
+    if ((!isMixedPrecision && outputPrecision != Precision::U8 && outputPrecision != Precision::I8) || axis != 1) {
         auto fmt = (inputPrecision == Precision::U8 || inputPrecision == Precision::I8) ? dims.ndims() == 2 ? memory::format::nc :
                                                                                           dims.ndims() == 4 ? memory::format::nhwc :
                                                                                                               memory::format::ndhwc
@@ -155,7 +153,7 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
         }
     }
 
-    if (axis != 1 || hasEltwise)
+    if (axis != 1)
         return;
 
     auto numOfDim = static_cast<size_t>(dstDims.ndims());
index 83d4862..ff40438 100644 (file)
@@ -5,10 +5,8 @@
 #include "mkldnn_conv_node.h"
 #include "mkldnn_reorder_node.h"
 #include "mkldnn_input_node.h"
-#include "mkldnn_activation_node.h"
 #include "desc_iterator.hpp"
 #include "mkldnn_eltwise_node.h"
-#include "mkldnn_depthwise_node.h"
 #include "mkldnn_quantize_node.h"
 #include "mkldnn_pooling_node.h"
 #include "mkldnn_concat_node.h"
@@ -110,6 +108,21 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
     if (convLayer == nullptr)
         THROW_IE_EXCEPTION << "Cannot convert convolution layer.";
 
+    withSum = false;
+    int expectedInputEdgesNum = baseInputsNumber;
+    for (int i = 0; i < fusedWith.size(); i++) {
+        auto *convolutionNode = dynamic_cast<MKLDNNConvolutionNode *>(fusedWith[i].get());
+        if (convolutionNode) {
+            expectedInputEdgesNum += convolutionNode->getBaseIntputsNumber() - 1;
+        }
+
+        auto *eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
+        if (eltwiseNode && eltwiseNode->isSum()) {
+            withSum = true;
+            expectedInputEdgesNum++;
+        }
+    }
+
     auto inputDataType = precisionToDataType(getCnnLayer()->insData[0].lock()->getPrecision());
     if (!inputZeroPoints.empty())
         inputDataType = memory::u8;
@@ -127,10 +140,10 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
 
         // We need to make sure that convolution output and second input of fused Eltwise operation
         // have equal precision sizes since they use the same physical memory. In case precisions are different we upscale to FP32.
-        if (outputDataType != memory::f32 && outputDataType != memory::bf16 && isFusedWith(Eltwise)) {
+        if (outputDataType != memory::f32 && outputDataType != memory::bf16 && withSum) {
             for (int i = 0; i < fusedWith.size(); i++) {
                 auto *eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
-                if (eltwiseNode) {
+                if (eltwiseNode && eltwiseNode->isSum()) {
                     eltwisePrecision = fusedEltwisePrecision(eltwiseNode, i);
                     if (MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType).size() != eltwisePrecision.size()) {
                         eltwisePrecision = Precision::FP32;
@@ -142,14 +155,6 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
         }
     }
 
-    int expectedInputEdgesNum = baseInputsNumber + isFusedWith(Eltwise);
-    for (int i = 0; i < fusedWith.size(); i++) {
-        auto *convolutionNode = dynamic_cast<MKLDNNConvolutionNode *>(fusedWith[i].get());
-        if (convolutionNode) {
-            expectedInputEdgesNum += convolutionNode->getBaseIntputsNumber() - 1;
-        }
-    }
-
     if (getParentEdges().size() != expectedInputEdgesNum)
         THROW_IE_EXCEPTION << "Incorrect number of input edges for layer " << getName();
     if (getChildEdges().empty())
@@ -232,7 +237,6 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
 
     MKLDNNDims weightsDims = MKLDNNDims(weightDims);
 
-    withSum = isFusedWith(Eltwise);
     withDWConv = isFusedWith(Convolution);
 
     for (int i = 0; i < fusedWith.size(); i++) {
@@ -287,7 +291,7 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
         eltwisePrecision = Precision::FP32;
         for (int i = 0; i < fusedWith.size(); i++) {
             auto *eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
-            if (eltwiseNode) {
+            if (eltwiseNode && eltwiseNode->isSum()) {
                 eltwisePrecision = fusedEltwisePrecision(eltwiseNode, i);
                 // TODO(amalyshe): there might be situation when convolution can be executed in BF16,
                 // output is required in FP32 but eltwise inplace tensor would be in BF16
@@ -364,93 +368,16 @@ void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWe
         if (node->getType() == Split || node->getType() == Concatenation)
             continue;
 
-#if defined (COMPILED_CPU_MKLDNN_ELTWISE_NODE)
         auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
-        if (eltwiseNode) {
-            if (eltwiseNode->getCnnLayer()->precision == Precision::I8) {
-                auto it = eltwiseNode->getCnnLayer()->blobs.find("eltwise-sum-scale");
-                if (it != eltwiseNode->getCnnLayer()->blobs.end()) {
-                    // currently there is the only one scale while we need scale by channel :(
-                    ops.append_sum(it->second->buffer().as<float*>()[0], mkldnn::memory::convert_to_c(precisionToDataType(eltwisePrecision)));
-                }
-            } else {
+        if (eltwiseNode && eltwiseNode->isSum()) {
                 ops.append_sum(1.0, mkldnn::memory::convert_to_c(precisionToDataType(eltwisePrecision)));
-            }
-
             continue;
         }
-#endif
-
-#if defined(COMPILED_CPU_MKLDNN_ACTIVATION_NODE)
-        auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get());
-        if (activationNode) {
-            ops.append_eltwise(1.0, activationNode->getAlgorithm(), activationNode->getAlpha(),
-                               activationNode->getBeta());
-            continue;
-        }
-#endif
-
-#if defined (COMPILED_CPU_MKLDNN_DEPTHWISE_NODE)
-        auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode *>(node.get());
-        if (depthwiseNode) {
-            auto* depthwiseLayer = reinterpret_cast<WeightableLayer*>(depthwiseNode->getCnnLayer().get());
-
-            if (initWeights) {
-                MKLDNNDims depthwiseDims({static_cast<ptrdiff_t>(rnd_up(biasesDims[0], 16))});
-
-                PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
-                PostOpsIntBlobMemory[blob_idx]->Create(depthwiseDims, memory::data_type::f32, memory::format::x);
-                PostOpsIntBlobMemory[blob_idx]->FillZero();
-                PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::x,
-                                                             depthwiseLayer->_weights->buffer(),
-                                                             depthwiseLayer->_weights->size() *
-                                                             MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
-
-                if (depthwiseNode->isBroadcast()) {
-                    float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[0];
-                    for (int i = 1; i < PostOpsIntBlobMemory[blob_idx]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
-                        static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[i] = broadcastValue;
-                    }
-                }
-
-                if (depthwiseNode->getAlgorithm() == depthwise_scale_shift) {
-                    PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
-                    PostOpsIntBlobMemory[blob_idx + 1]->Create(depthwiseDims, memory::data_type::f32,
-                                                                memory::format::x);
-                    PostOpsIntBlobMemory[blob_idx + 1]->FillZero();
-                    PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::x,
-                                                                 depthwiseLayer->_biases->buffer(),
-                                                                 depthwiseLayer->_biases->size() *
-                                                                 MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
-
-                    if (depthwiseNode->isBroadcast()) {
-                        float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[0];
-                        for (int i = 1; i < PostOpsIntBlobMemory[blob_idx + 1]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
-                            static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[i] = broadcastValue;
-                        }
-                    }
-
-                    ops.append_depthwise(depthwiseNode->getAlgorithm(),
-                                         (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(),
-                                         (const float *) PostOpsIntBlobMemory[blob_idx + 1]->GetData());
-
-                    blob_idx += 2;
-                } else {
-                    ops.append_depthwise(depthwiseNode->getAlgorithm(),
-                                         (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(),
-                                         nullptr);
-
-                    blob_idx += 1;
-                }
-            } else {
-                ops.append_depthwise(depthwiseNode->getAlgorithm(),
-                                     nullptr,
-                                     nullptr);
-            }
 
+        if (eltwiseNode) {
+            eltwiseNode->appendPostOps(ops);
             continue;
         }
-#endif
 
         auto* quantizeNode = dynamic_cast<MKLDNNQuantizeNode *>(node.get());
         if (quantizeNode) {
index 936df9a..f09611b 100644 (file)
@@ -5,10 +5,8 @@
 #include "mkldnn_def_conv_node.h"
 #include "mkldnn_reorder_node.h"
 #include "mkldnn_input_node.h"
-#include "mkldnn_activation_node.h"
 #include "desc_iterator.hpp"
 #include "mkldnn_eltwise_node.h"
-#include "mkldnn_depthwise_node.h"
 #include <legacy/ie_layers.h>
 #include <string>
 #include <vector>
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp
deleted file mode 100644 (file)
index 486cc96..0000000
+++ /dev/null
@@ -1,353 +0,0 @@
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#include "mkldnn_depthwise_node.h"
-#include "desc_iterator.hpp"
-#include <legacy/ie_layers.h>
-#include <string>
-#include <vector>
-#include <mkldnn_types.h>
-#include <mkldnn_extension_utils.h>
-#include "caseless.hpp"
-
-using namespace mkldnn;
-using namespace MKLDNNPlugin;
-using namespace InferenceEngine;
-using namespace InferenceEngine::details;
-
-MKLDNNDepthwiseNode::MKLDNNDepthwiseNode(InferenceEngine::CNNLayerPtr layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache)
-        : MKLDNNNode(layer, eng, cache) {
-    internalBlobDesc.emplace_back([&](primitive_desc_iterator &primitive_desc_it, size_t idx) -> MKLDNNMemoryDesc {
-        return MKLDNNMemoryDesc(primitive_desc_it.weights_primitive_desc(0).desc());
-    });
-    internalBlobDesc.emplace_back([&](primitive_desc_iterator &primitive_desc_it, size_t idx) -> MKLDNNMemoryDesc {
-        if (!isWithBiases())
-            return MKLDNNMemoryDesc();
-        return MKLDNNMemoryDesc(primitive_desc_it.weights_primitive_desc(1).desc());
-    });
-}
-
-void MKLDNNDepthwiseNode::getSupportedDescriptors() {
-    InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision();
-    if (precision != InferenceEngine::Precision::FP32)
-        precision = InferenceEngine::Precision::FP32;
-    auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
-
-    auto parentOutDims = getParentEdgeAt(0)->getDims();
-
-    if (getParentEdges().size() != 1)
-        THROW_IE_EXCEPTION << "Cannot create layer " << getName() << ": Incorrect number of inputs!";
-    if (parentOutDims != getChildEdgeAt(0)->getDims())
-        THROW_IE_EXCEPTION << "Cannot create layer " << getName() << ": Incorrect dimensions!";
-
-    auto size = static_cast<size_t>(parentOutDims.ndims() == 1 ? parentOutDims[0] : parentOutDims[1]);
-    SizeVector weightDims = { size };
-    MKLDNNDims blocked_weightDims(weightDims);
-
-    auto * wLayer = dynamic_cast<InferenceEngine::WeightableLayer*>(getCnnLayer().get());
-    if (wLayer == nullptr)
-        THROW_IE_EXCEPTION << "Cannot get weightable layer for node " << getName() << ".";
-
-    InferenceEngine::Blob::Ptr blb = wLayer->_weights;
-    if (blb)
-        realWeightSize = blb->size();
-    internalBlobs.push_back(createInternalBlob(weightDims, true));
-    if (isWithBiases()) {
-        InferenceEngine::Blob::Ptr blb = wLayer->_biases;
-        if (blb)
-            realBiasSize = blb->size();
-        internalBlobs.push_back(createInternalBlob(weightDims, false));
-    }
-
-    for (auto format : getAvailableFormatsForDims(parentOutDims)) {
-        MKLDNNMemoryDesc in_candidate{parentOutDims, inputDataType, format};
-        createDescriptor({in_candidate}, {});
-    }
-}
-
-void MKLDNNDepthwiseNode::initSupportedPrimitiveDescriptors() {
-    if (!supportedPrimitiveDescriptors.empty())
-        return;
-
-    auto parentOutDims = getParentEdgeAt(0)->getDims();
-    if (parentOutDims.ndims() <= 5) {
-        MKLDNNNode::initSupportedPrimitiveDescriptors();
-    } else {
-        createSpecificDescriptor5D();
-        if (specificDesc5DPtr == nullptr)
-            THROW_IE_EXCEPTION << "Cannot create specific MKLDNNDescriptor for depthwise node " << getName();
-        const auto& desc = *specificDesc5DPtr;
-        auto itpd = desc.createPrimitiveDescriptorIterator(getEngine());
-        while (itpd.is_not_end()) {
-            InferenceEngine::LayerConfig config;
-            config.dynBatchSupport = true;
-            for (size_t i = 0; i < descInputNumbers(desc); i++) {
-                InferenceEngine::DataConfig dataConfig;
-                dataConfig.inPlace = -1;
-                dataConfig.constant = false;
-                dataConfig.desc = MKLDNNMemoryDesc(InferenceEngine::TensorDesc(Precision::FP32, parentOutDims.ToSizeVector(), Layout::ANY));
-                config.inConfs.push_back(dataConfig);
-            }
-
-            std::vector<mkldnn::memory::format> outFormats;
-            for (size_t i = 0; i < descOutputNumbers(desc); i++) {
-                InferenceEngine::DataConfig dataConfig;
-                dataConfig.inPlace = canBeInPlace() ? 0 : -1;
-                dataConfig.constant = false;
-                dataConfig.desc = MKLDNNMemoryDesc(InferenceEngine::TensorDesc(Precision::FP32, parentOutDims.ToSizeVector(), Layout::ANY));
-                config.outConfs.push_back(dataConfig);
-
-                auto primDesc = itpd.fetch();
-                auto dstPrimDesc = mkldnn_primitive_desc_query_pd(primDesc.get(), mkldnn::convert_to_c(dst_pd), 0);
-                if (dstPrimDesc) {
-                    outFormats.emplace_back(static_cast<memory::format>(itpd.dst_primitive_desc().desc().data.format));
-                } else {
-                    // This path is needed to correctly handle Deconvolution node
-                    auto diffSrcPrimDesc = mkldnn_primitive_desc_query_pd(primDesc.get(), mkldnn::convert_to_c(diff_src_pd), 0);
-                    if (diffSrcPrimDesc) {
-                        outFormats.emplace_back(static_cast<memory::format>(itpd.diff_src_primitive_desc().desc().data.format));
-                    }
-                }
-            }
-            impl_desc_type impl_type = parse_impl_name(itpd.get_impl_info_str());
-
-            supportedPrimitiveDescriptors.emplace_back(config, impl_type, outFormats);
-            itpd++;
-        }
-    }
-}
-
-void MKLDNNDepthwiseNode::createPrimitive() {
-    if (prim)
-        return;
-
-    auto& dstMemPtr = getChildEdgeAt(0)->getMemoryPtr();
-    auto& srcMemPtr = getParentEdgeAt(0)->getMemoryPtr();
-    if (!dstMemPtr || !dstMemPtr->GetPrimitivePtr())
-        THROW_IE_EXCEPTION << "Destination memory didn't allocate.";
-    if (!srcMemPtr || !srcMemPtr->GetPrimitivePtr())
-        THROW_IE_EXCEPTION << "Input memory didn't allocate.";
-    if (getSelectedPrimitiveDescriptor() == nullptr)
-        THROW_IE_EXCEPTION << "Preferable primitive descriptor is not set.";
-
-    auto createRightPrimitiveDescriptor = [&]() -> depthwise_forward::primitive_desc {
-        auto parentOutDims = getParentEdgeAt(0)->getDims();
-        if (parentOutDims.ndims() <= 5) {
-            return createPrimitiveDescriptor<depthwise_forward::primitive_desc, depthwise_forward::desc>();
-        } else {
-            const PrimitiveDescInfo *selected_pd = getSelectedPrimitiveDescriptor();
-            auto& desc = *specificDesc5DPtr;
-            auto itpd = desc.createPrimitiveDescriptorIterator(getEngine(), mkldnn::primitive_attr());
-
-            while (itpd.is_not_end())  {
-                impl_desc_type impl_type = parse_impl_name(itpd.get_impl_info_str());
-                if (impl_type == getSelectedPrimitiveDescriptor()->getImplementationType()) {
-                    specificPrepareMemory5D(itpd);
-                    std::shared_ptr<depthwise_forward::desc> selected_desc_ptr = desc;
-                    depthwise_forward::primitive_desc prim_desc = depthwise_forward::primitive_desc(*selected_desc_ptr, getEngine());
-                    return prim_desc;
-                }
-                itpd++;
-            }
-            THROW_IE_EXCEPTION << "Cannot create specific primitive descriptor for depthwise node " << getName() << ".";
-        }
-    };
-
-    auto prim_desc = createRightPrimitiveDescriptor();
-
-    if (isBroadcast()) {
-        float broadcastValue = static_cast<float*>(internalBlobMemory[0]->GetData())[0];
-        size_t blbSize = internalBlobMemory[0]->GetPrimitiveDescriptor().desc().data.dims[0];
-        for (int i = 1; i < blbSize && realWeightSize != blbSize; i++) {
-            static_cast<float*>(internalBlobMemory[0]->GetData())[i] = broadcastValue;
-        }
-
-        if (isWithBiases()) {
-            blbSize = internalBlobMemory[1]->GetPrimitiveDescriptor().desc().data.dims[0];
-            broadcastValue = static_cast<float*>(internalBlobMemory[1]->GetData())[0];
-            for (int i = 1; i < blbSize && realBiasSize != blbSize; i++) {
-                static_cast<float*>(internalBlobMemory[1]->GetData())[i] = broadcastValue;
-            }
-        }
-    } else {
-        size_t blbSize = internalBlobMemory[0]->GetPrimitiveDescriptor().desc().data.dims[0];
-        if (realWeightSize != blbSize)
-            THROW_IE_EXCEPTION << "Cannot create layer " << getName() << ": Incorrect weights!";
-        if (isWithBiases()) {
-            blbSize = internalBlobMemory[1]->GetPrimitiveDescriptor().desc().data.dims[0];
-            if (realBiasSize != blbSize)
-                THROW_IE_EXCEPTION << "Cannot create layer " << getName() << ": Incorrect biases!";
-        }
-    }
-
-    if (isWithBiases()) {
-        prim.reset(new depthwise_forward(prim_desc, getParentEdgeAt(0)->getMemory().GetPrimitive(),
-                                         internalBlobMemory[0]->GetPrimitive(),
-                                         internalBlobMemory[1]->GetPrimitive(),
-                                         getChildEdgeAt(0)->getMemory().GetPrimitive()));
-    } else {
-        prim.reset(new depthwise_forward(prim_desc, getParentEdgeAt(0)->getMemory().GetPrimitive(),
-                                         internalBlobMemory[0]->GetPrimitive(),
-                                         getChildEdgeAt(0)->getMemory().GetPrimitive()));
-    }
-}
-
-bool MKLDNNDepthwiseNode::created() const {
-    return getType() == Depthwise;
-}
-
-void MKLDNNDepthwiseNode::init() {
-    GenericLayer* depthwiseLayer = getCnnLayer().get();
-    if (depthwiseLayer == nullptr)
-        THROW_IE_EXCEPTION << "Cannot get CNNLayer.";
-
-    CaselessEq<std::string> comparator;
-    if (comparator(depthwiseLayer->type, "ScaleShift")) {
-        auto *scshLayer = dynamic_cast<ScaleShiftLayer*>(getCnnLayer().get());
-        if (scshLayer == nullptr)
-            THROW_IE_EXCEPTION << "Cannot get scale shift layer " << getName();
-        if (scshLayer->_weights == nullptr)
-            THROW_IE_EXCEPTION << "ScaleShift without weights is not supported";
-
-        algorithm = depthwise_scale_shift;
-        withBiases = scshLayer->_biases != nullptr;
-        broadcast = static_cast<bool>(scshLayer->_broadcast);
-    } else if (comparator(depthwiseLayer->type, "PReLU")) {
-        auto *preluLayer = dynamic_cast<PReLULayer*>(getCnnLayer().get());
-        if (preluLayer == nullptr)
-            THROW_IE_EXCEPTION << "Cannot get PReLU layer " << getName();
-        if (preluLayer->_weights == nullptr)
-            THROW_IE_EXCEPTION << "PReLU without weights is not supported";
-
-        algorithm = depthwise_prelu;
-        withBiases = false;
-        broadcast = preluLayer->_channel_shared;
-    } else {
-        THROW_IE_EXCEPTION << "Unsupported depthwise operation";
-    }
-}
-
-void MKLDNNDepthwiseNode::createDescriptor(const std::vector<InferenceEngine::TensorDesc> &inputDesc,
-                                           const std::vector<InferenceEngine::TensorDesc> &outputDesc) {
-    MKLDNNMemoryDesc in_candidate(inputDesc[0]);
-    MKLDNNMemoryDesc out_candidate(inputDesc[0]);
-    MKLDNNDims weightDims({in_candidate.getDims().ndims() == 1 ? in_candidate.getDims()[0] : in_candidate.getDims()[1]});
-
-    MKLDNNMemoryDesc wgh_candidate{weightDims, in_candidate.getDataType(), memory::x};
-
-    if (isWithBiases()) {
-        MKLDNNMemoryDesc bias_candidate{weightDims, in_candidate.getDataType(), memory::x};
-        MKLDNNDescriptor desc(std::shared_ptr<depthwise_forward::desc>(
-                new depthwise_forward::desc(prop_kind::forward_scoring, getAlgorithm(), in_candidate, out_candidate, wgh_candidate, bias_candidate)));
-        descs.push_back(desc);
-    } else {
-        MKLDNNDescriptor desc(std::shared_ptr<depthwise_forward::desc>(
-                new depthwise_forward::desc(prop_kind::forward_scoring, getAlgorithm(), in_candidate, out_candidate, wgh_candidate)));
-        descs.push_back(desc);
-    }
-}
-
-void MKLDNNDepthwiseNode::initOptimalPrimitiveDescriptor() {
-    auto selected_pd = getSelectedPrimitiveDescriptor();
-    if (selected_pd == nullptr)
-        THROW_IE_EXCEPTION << "Preferable primitive descriptor is not set.";
-    auto config = selected_pd->getConfig();
-    if (isInitConfig(config))
-        return;
-
-    if (config.inConfs.size() != 1 || config.outConfs.size() != 1 || (!isUninitTensorDesc(config.inConfs[0].desc) &&
-            !isUninitTensorDesc(config.outConfs[0].desc) && config.inConfs[0].desc != config.outConfs[0].desc))
-        THROW_IE_EXCEPTION << "Layer " << getName() << " has incorrect selected config!";
-
-    if (getParentEdgeAt(0)->getDims().ndims() > 5)
-        return;
-
-    if (!isUninitTensorDesc(config.inConfs[0].desc)) {
-        config.outConfs[0].desc = config.inConfs[0].desc;
-    } else if (!isUninitTensorDesc(config.outConfs[0].desc)) {
-        config.inConfs[0].desc = config.outConfs[0].desc;
-    } else {
-        config.outConfs[0].desc = config.inConfs[0].desc = getConfiguredInputDesc(config, 0);
-    }
-
-    initDescriptor(config);
-}
-
-void MKLDNNDepthwiseNode::createSpecificDescriptor5D() {
-    auto parentOutDims = getParentEdgeAt(0)->getDims();
-    MKLDNNDims newDims;
-    for (int i = 0; i < 4; i++)
-        newDims.push_back(parentOutDims[i]);
-    int lastDim = 1;
-    for (int i = 4; i < parentOutDims.ndims(); i++) {
-        lastDim *= parentOutDims[i];
-    }
-    newDims.push_back(lastDim);
-
-    MKLDNNMemoryDesc in_candidate{newDims, MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::Precision::FP32), mkldnn::memory::ncdhw};
-    MKLDNNMemoryDesc out_candidate(in_candidate);
-    MKLDNNDims weightDims({in_candidate.getDims()[1]});
-
-    MKLDNNMemoryDesc wgh_candidate{weightDims, in_candidate.getDataType(), memory::x};
-
-    if (isWithBiases()) {
-        MKLDNNMemoryDesc bias_candidate{weightDims, in_candidate.getDataType(), memory::x};
-        MKLDNNDescriptor desc(std::shared_ptr<depthwise_forward::desc>(
-                new depthwise_forward::desc(prop_kind::forward_scoring, getAlgorithm(), in_candidate, out_candidate, wgh_candidate, bias_candidate)));
-        specificDesc5DPtr = std::make_shared<MKLDNNDescriptor>(desc);
-    } else {
-        MKLDNNDescriptor desc(std::shared_ptr<depthwise_forward::desc>(
-                new depthwise_forward::desc(prop_kind::forward_scoring, getAlgorithm(), in_candidate, out_candidate, wgh_candidate)));
-        specificDesc5DPtr = std::make_shared<MKLDNNDescriptor>(desc);
-    }
-}
-
-void MKLDNNDepthwiseNode::specificPrepareMemory5D(mkldnn::primitive_desc_iterator& itpd) {
-    std::vector<MKLDNNMemoryDesc> intDescs;
-    for (auto &it : internalBlobDesc)
-        intDescs.push_back(it(itpd, 0));
-
-    internalBlobMemory.clear();
-    for (size_t i = 0; i < internalBlobs.size(); i++) {
-        const auto &internalBlob = internalBlobs[i];
-
-        auto create = [&] () {
-            auto newDesc = MKLDNNMemoryDesc(internalBlob->getTensorDesc());
-            auto newFormat = newDesc.getFormat();
-            if (newFormat == mkldnn::memory::ncdhw) {
-                newFormat = mkldnn::memory::goihw;
-            }
-            if (newFormat == mkldnn::memory::nchw) {
-                newFormat = mkldnn::memory::oihw;
-            }
-
-            MKLDNNMemory memory{ getEngine() };
-            memory.Create(MKLDNNMemoryDesc(newDesc.getDims(), newDesc.getDataType(), newFormat), internalBlob->buffer());
-
-            MKLDNNMemoryPtr _ptr = MKLDNNMemoryPtr(new MKLDNNMemory(getEngine()));
-            _ptr->Create(intDescs[i]);
-            _ptr->SetData(memory);
-
-            return _ptr;
-        };
-
-        MKLDNNMemoryPtr ptr;
-        if (weightCache != nullptr) {
-            const uint64_t data_hash = weightCache->GetHashFunc().hash(
-                    internalBlob->buffer(), internalBlob->byteSize());
-
-            const std::string string_hash = getName() + "_" + std::to_string(i)
-                                            + "_" + std::to_string(internalBlob->byteSize())
-                                            + "_" + std::to_string(data_hash);
-
-            ptr = weightCache->findOrCreate(string_hash, create);
-        } else {
-            ptr = create();
-        }
-
-        internalBlobMemory.push_back(ptr);
-    }
-}
-
-REG_MKLDNN_PRIM_FOR(MKLDNNDepthwiseNode, Depthwise);
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.h
deleted file mode 100644 (file)
index 01f9648..0000000
+++ /dev/null
@@ -1,46 +0,0 @@
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma once
-
-#include <ie_common.h>
-#include <mkldnn_node.h>
-#include <string>
-#include <memory>
-#include <vector>
-
-namespace MKLDNNPlugin {
-
-class MKLDNNDepthwiseNode : public MKLDNNNode {
-public:
-    MKLDNNDepthwiseNode(InferenceEngine::CNNLayerPtr layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
-    ~MKLDNNDepthwiseNode() override = default;
-
-    void createDescriptor(const std::vector<InferenceEngine::TensorDesc>& inputDesc,
-                          const std::vector<InferenceEngine::TensorDesc>& outputDesc) override;
-    void initOptimalPrimitiveDescriptor() override;
-    void getSupportedDescriptors() override;
-    void initSupportedPrimitiveDescriptors() override;
-    void createPrimitive() override;
-    bool created() const override;
-
-    mkldnn::algorithm getAlgorithm() const { return algorithm; }
-    bool isWithBiases() const { return withBiases; }
-    bool isBroadcast() const { return broadcast; }
-
-private:
-    void init() override;
-
-    mkldnn::algorithm algorithm = mkldnn::algorithm::depthwise_scale_shift;
-    size_t realWeightSize = 0;
-    size_t realBiasSize = 0;
-    bool withBiases = false;
-    bool broadcast = false;
-
-    std::shared_ptr<MKLDNNDescriptor> specificDesc5DPtr;
-    void createSpecificDescriptor5D();
-    void specificPrepareMemory5D(mkldnn::primitive_desc_iterator& itpd);
-};
-
-}  // namespace MKLDNNPlugin
index 7f36301..2c0fc6f 100644 (file)
 #include <mkldnn_extension_utils.h>
 #include "ie_parallel.hpp"
 #include "mkldnn_quantize_node.h"
-#include "mkldnn_activation_node.h"
 #include <map>
 #include "jit_uni_eltwise.hpp"
 #include "jit_uni_quantization.hpp"
+#include "common/emitter.h"
+#include "jit_eltwise_emitters.hpp"
+#include "jit_mkldnn_emitters.hpp"
+#include "ref_eltwise.hpp"
+#include "mkldnn_pooling_node.h"
 
-using namespace mkldnn;
 using namespace MKLDNNPlugin;
 using namespace InferenceEngine;
-using namespace mkldnn::impl;
-using namespace mkldnn::impl::cpu;
 using namespace mkldnn::impl::utils;
+
+using namespace mkldnn::impl::cpu;
 using namespace Xbyak;
 
-#define GET_OFF(field) offsetof(jit_eltwise_fq_call_args, field)
+#define GET_OFF(field) offsetof(jit_eltwise_call_args, field)
 
 template <cpu_isa_t isa>
-struct jit_uni_eltwise_fq_generic : public jit_uni_eltwise_fq_kernel, public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_eltwise_fq_generic)
-
-    explicit jit_uni_eltwise_fq_generic(jit_eltwise_fq_params jep, const mkldnn_primitive_attr &attr) : jit_uni_eltwise_fq_kernel(jep, attr), jit_generator() {
-        const auto &p = attr_.post_ops_;
-        for (int i = 0; i < p.len_; i++) {
-            auto &post_op = p.entry_[i];
-            if (post_op.is_eltwise()) {
-                eltwise_injectors.push_back(std::make_shared<jit_uni_eltwise_injector_f32<isa>>(
-                        this, post_op.eltwise.alg, post_op.eltwise.alpha, post_op.eltwise.beta));
-            } else if (post_op.is_quantization()) {
+struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_generator {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_eltwise_generic)
+
+    explicit jit_uni_eltwise_generic(jit_eltwise_params jep, MKLDNNEltwiseNode& eltwiseNode) : jit_uni_eltwise_kernel(jep, eltwiseNode), jit_generator() {
+        Precision exec_prc = Precision::UNSPECIFIED;
+
+        std::set<Precision> supported_precision_intersection = get_supported_precisions(eltwiseNode);
+        for (int i = 0; i < eltwiseNode.getFusedWith().size(); i++) {
+            if (eltwiseNode.getFusedWith()[i].get()->getType() == Eltwise) {
+                std::set<Precision> prcs = get_supported_precisions(*eltwiseNode.getFusedWith()[i].get());
+
+                std::set_intersection(supported_precision_intersection.begin(), supported_precision_intersection.end(),
+                                      prcs.begin(), prcs.end(), std::inserter(supported_precision_intersection, supported_precision_intersection.begin()));
+            }
+        }
+
+        for (auto prc : exec_precisions_priority) {
+            if (std::find(supported_precision_intersection.begin(), supported_precision_intersection.end(), prc) != supported_precision_intersection.end()) {
+                exec_prc = prc;
+                break;
+            }
+        }
+
+        for (int i = 0; i < jep_.inputs_number; i++) {
+            if (jep_.src_prc[i] != exec_prc) {
+                exec_prc = Precision::FP32;
+                break;
+            }
+        }
+
+        if (exec_prc == Precision::UNSPECIFIED) {
+            THROW_IE_EXCEPTION << "Eltwise jitter failed to specify execution precision for Eltwise node with name `" << eltwiseNode.getName() << "`";
+        }
+
+        eltwise_emitter = create_eltwise_emitter(eltwiseNode, exec_prc);
+
+        mkldnn::post_ops post_ops;
+        for (int i = 0; i < eltwiseNode.getFusedWith().size(); i++) {
+            if (eltwiseNode.getFusedWith()[i].get()->getType() == Eltwise) {
+                post_op_emitters.push_back(create_eltwise_emitter(*eltwiseNode.getFusedWith()[i].get(), exec_prc));
+            } else if (eltwiseNode.getFusedWith()[i].get()->getType() == Quantize) {
+                auto quantizeNode = dynamic_cast<MKLDNNQuantizeNode*>(eltwiseNode.getFusedWith()[i].get());
+                quantizeNode->appendPostOps(post_ops);
+
                 quantization_injectors.push_back(std::make_shared<jit_uni_quantization_injector_f32<isa>>(
-                        this, post_op, vmm_d_weights, vmm_d_bias, reg_d_weights, reg_d_bias));
+                        this, post_ops.get()->entry_[post_ops.get()->len_ - 1], vmm_d_weights, vmm_d_bias, reg_d_weights, reg_d_bias));
             }
         }
 
         this->preamble();
 
-        mov(reg_src0, ptr[reg_params + GET_OFF(src0)]);
-        mov(reg_src1, ptr[reg_params + GET_OFF(src1)]);
+        for (int i = 0; i < jep.inputs_number; i++)
+            mov(get_src_reg(i), ptr[reg_params + GET_OFF(src_ptr[0]) + i * sizeof(size_t)]);
         mov(reg_dst, ptr[reg_params + GET_OFF(dst)]);
         mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]);
-        xor_(reg_oc_off, reg_oc_off);
+        mov(reg_oc_off, ptr[reg_params + GET_OFF(oc_off)]);
 
+        Xbyak::Label unroll_loop_label;
+        Xbyak::Label unroll_loop_end_label;
         Xbyak::Label main_loop_label;
         Xbyak::Label main_loop_end_label;
         Xbyak::Label tail_loop_label;
@@ -61,131 +99,145 @@ struct jit_uni_eltwise_fq_generic : public jit_uni_eltwise_fq_kernel, public jit
         if (isa == avx512_common)
             vpxord(vmm_zero, vmm_zero, vmm_zero);
 
-        if (jep.src0_step == 0)
-            uni_vbroadcastss(vmm_src0, ptr[reg_src0]);
-        if (jep.src1_step == 0)
-            uni_vbroadcastss(vmm_src1, ptr[reg_src1]);
+        for (int i = 0; i < jep.inputs_number; i++) {
+            if (jep.src_size[i] == 1)
+                load_vector(get_vmm_reg(i), ptr[get_src_reg(i)], jep.src_prc[i], exec_prc, true);
+        }
 
-        L(main_loop_label);
-        {
-            cmp(reg_work_amount, simd_w);
-            jl(main_loop_end_label, T_NEAR);
-
-            if (jep.src0_step != 0)
-                load_vector(vmm_src0, ptr[reg_src0], jep.src0_dt);
-            if (jep.src1_step != 0)
-                load_vector(vmm_src1, ptr[reg_src1], jep.src1_dt);
-
-            switch (jep.eltwise_op) {
-                case EltwiseLayer::eOperation::Sum:
-                    if (isa == cpu::sse42) {
-                        uni_vmovups(vmm_dst, vmm_src0);
-                        uni_vaddps(vmm_dst, vmm_dst, vmm_src1);
-                    } else {
-                        uni_vaddps(vmm_dst, vmm_src0, vmm_src1);
-                    }
-                    break;
-                case EltwiseLayer::eOperation::Prod:
-                    if (isa == cpu::sse42) {
-                        uni_vmovups(vmm_dst, vmm_src0);
-                        uni_vmulps(vmm_dst, vmm_dst, vmm_src1);
-                    } else {
-                        uni_vmulps(vmm_dst, vmm_src0, vmm_src1);
-                    }
-                    break;
-                default: THROW_IE_EXCEPTION << "Unsupported operation type for Eltwise node";
+        size_t min_src_size = jep.dst_size;
+        for (int i = 0; i < jep.inputs_number; i++) {
+            if (jep.src_size[i] != 1)
+                min_src_size = std::min(min_src_size, jep.src_size[i]);
+        }
+        if (jep_.oc_size > 1)
+            min_src_size = std::min(min_src_size, jep_.oc_size);
+
+        if (min_src_size != jep.dst_size) {
+            bool is_valid_configuration = true;
+            if (jep.dst_size % min_src_size != 0)
+                is_valid_configuration = false;
+
+            for (int i = 0; i < jep.inputs_number; i++) {
+                if (jep.src_size[i] != 1 && jep.src_size[i] != min_src_size && jep.src_size[i] != jep.dst_size)
+                    is_valid_configuration = false;
             }
 
-            int eltwise_inj_idx = 0;
-            int quantization_inj_idx = 0;
-            for (int i = 0; i < p.len_; i++) {
-                auto &post_op = p.entry_[i];
-                if (post_op.is_eltwise()) {
-                    eltwise_injectors[eltwise_inj_idx]->compute_vector_range(vmm_dst.getIdx(), vmm_dst.getIdx() + 1);
-                    eltwise_inj_idx++;
-                } else if (post_op.is_quantization()) {
-                    bool do_dequantization = post_op.quantization.alg == alg_kind::quantization_quantize_dequantize;
-                    bool do_rounding = do_dequantization || jep_.dst_dt == data_type::f32 || i != p.len_ - 1;
-                    int s_idx = vmm_dst.getIdx();
+            if (jep_.oc_size > 1 && jep_.oc_size != min_src_size && jep_.oc_size != jep.dst_size)
+                is_valid_configuration = false;
+
+            if (!is_valid_configuration)
+                THROW_IE_EXCEPTION << "Eltwise jitter has invalid configuration for Eltwise node with name `" << eltwiseNode.getName() << "`";
+
+            L(unroll_loop_label);
+            {
+                size_t loop_step = min_src_size;
+                size_t vec_step = cpu_isa_traits<isa>::vlen / exec_prc.size();
 
-                    quantization_injectors[quantization_inj_idx]->init_crop_ptrs(reg_oc_off);
-                    quantization_injectors[quantization_inj_idx]->compute_crop(s_idx, s_idx + 1, 0);
+                cmp(reg_work_amount, loop_step);
+                jl(unroll_loop_end_label, T_NEAR);
 
-                    quantization_injectors[quantization_inj_idx]->init_input_scale_shift_ptrs(reg_oc_off);
-                    quantization_injectors[quantization_inj_idx]->compute_input_scale_shift(s_idx, s_idx + 1, 0, do_rounding);
+                for (int j = 0; j < min_src_size / vec_step; j++) {
+                    for (int i = 0; i < jep.inputs_number; i++) {
+                        if (jep.src_size[i] != 1)
+                            load_vector(get_vmm_reg(i), ptr[get_src_reg(i) + j * vec_step * jep.src_prc[i].size()], jep.src_prc[i], exec_prc, false);
+                    }
+
+                    compute_eltwise_op();
 
-                    quantization_injectors[quantization_inj_idx]->init_output_scale_shift_ptrs(reg_oc_off);
-                    quantization_injectors[quantization_inj_idx]->compute_output_scale_shift(s_idx, s_idx + 1, 0);
+                    apply_post_ops(false, jep_.oc_size > 1 ? j * vec_step * sizeof(float) : 0);
 
-                    quantization_inj_idx++;
+                    store_vector(ptr[reg_dst + j * vec_step * jep.dst_prc.size()], vmm_dst, exec_prc, jep.dst_prc);
                 }
-            }
 
-            store_vector(ptr[reg_dst], vmm_dst, jep.dst_dt);
+                int tail_start = min_src_size - min_src_size % vec_step;
+                for (int j = tail_start; j < min_src_size; j++) {
+                    for (int i = 0; i < jep.inputs_number; i++) {
+                        if (jep.src_size[i] != 1)
+                            load_scalar(get_xmm_reg(i), ptr[get_src_reg(i) + j * jep.src_prc[i].size()], jep.src_prc[i], exec_prc);
+                    }
 
-            if (jep.src0_step != 0)
-                add(reg_src0, jep.src0_step * jep.src0_data_size * simd_w);
-            if (jep.src1_step != 0)
-                add(reg_src1, jep.src1_step * jep.src1_data_size * simd_w);
-            add(reg_dst, jep.dst_step * jep.dst_data_size * simd_w);
-            sub(reg_work_amount, simd_w);
-            add(reg_oc_off, simd_w * sizeof(float));
+                    compute_eltwise_op();
 
-            jmp(main_loop_label, T_NEAR);
-        }
+                    apply_post_ops(true, jep_.oc_size > 1 ? j * sizeof(float) : 0);
 
-        L(main_loop_end_label);
+                    store_scalar(ptr[reg_dst + j * jep.dst_prc.size()], xmm_dst, exec_prc, jep.dst_prc);
+                }
 
-        L(tail_loop_label);
-        {
-            cmp(reg_work_amount, 1);
-            jl(tail_loop_end_label, T_NEAR);
+                for (int i = 0; i < jep.inputs_number; i++)
+                    if (jep.src_size[i] == jep.dst_size)
+                        add(get_src_reg(i), jep.src_prc[i].size() * loop_step);
 
-            if (jep.src0_step != 0)
-                load_scalar(xmm_src0, ptr[reg_src0], jep.src0_dt);
-            if (jep.src1_step != 0)
-                load_scalar(xmm_src1, ptr[reg_src1], jep.src1_dt);
+                add(reg_dst, jep.dst_prc.size() * loop_step);
+                sub(reg_work_amount, loop_step);
+                if (jep_.oc_size > 1 && jep_.oc_size != min_src_size)
+                    add(reg_oc_off, loop_step * sizeof(float));
 
-            switch (jep.eltwise_op) {
-                case EltwiseLayer::eOperation::Sum: uni_vaddps(vmm_dst, vmm_src0, vmm_src1); break;
-                case EltwiseLayer::eOperation::Prod: uni_vmulps(vmm_dst, vmm_src0, vmm_src1); break;
-                default: THROW_IE_EXCEPTION << "Unsupported operation type for Eltwise node";
+                jmp(unroll_loop_label, T_NEAR);
             }
 
-            int eltwise_inj_idx = 0;
-            int quantization_inj_idx = 0;
-            for (int i = 0; i < p.len_; i++) {
-                auto &post_op = p.entry_[i];
-                if (post_op.is_eltwise()) {
-                    eltwise_injectors[eltwise_inj_idx]->compute_vector_range(vmm_dst.getIdx(), vmm_dst.getIdx() + 1);
-                    eltwise_inj_idx++;
-                } else if (post_op.is_quantization()) {
-                    bool do_dequantization = post_op.quantization.alg == alg_kind::quantization_quantize_dequantize;
-                    bool do_rounding = do_dequantization || jep_.dst_dt == data_type::f32 || i != p.len_ - 1;
-                    int s_idx = vmm_dst.getIdx();
-
-                    quantization_injectors[quantization_inj_idx]->init_crop_ptrs(reg_oc_off);
-                    quantization_injectors[quantization_inj_idx]->compute_crop(s_idx, s_idx + 1, 0, true);
+            L(unroll_loop_end_label);
+        }
 
-                    quantization_injectors[quantization_inj_idx]->init_input_scale_shift_ptrs(reg_oc_off);
-                    quantization_injectors[quantization_inj_idx]->compute_input_scale_shift(s_idx, s_idx + 1, 0, do_rounding, true);
+        if (min_src_size == jep.dst_size) {
+            L(main_loop_label);
+            {
+                size_t loop_step = cpu_isa_traits<isa>::vlen / exec_prc.size();
 
-                    quantization_injectors[quantization_inj_idx]->init_output_scale_shift_ptrs(reg_oc_off);
-                    quantization_injectors[quantization_inj_idx]->compute_output_scale_shift(s_idx, s_idx + 1, 0, true);
+                cmp(reg_work_amount, loop_step);
+                jl(main_loop_end_label, T_NEAR);
 
-                    quantization_inj_idx++;
+                for (int i = 0; i < jep.inputs_number; i++) {
+                    if (jep.src_size[i] != 1)
+                        load_vector(get_vmm_reg(i), ptr[get_src_reg(i)], jep.src_prc[i], exec_prc, false);
                 }
+
+                compute_eltwise_op();
+
+                apply_post_ops(false);
+
+                store_vector(ptr[reg_dst], vmm_dst, exec_prc, jep.dst_prc);
+
+                for (int i = 0; i < jep.inputs_number; i++)
+                    if (jep.src_size[i] != 1)
+                        add(get_src_reg(i), jep.src_prc[i].size() * loop_step);
+
+                add(reg_dst, jep.dst_prc.size() * loop_step);
+                sub(reg_work_amount, loop_step);
+                if (jep_.oc_size > 1)
+                    add(reg_oc_off, loop_step * sizeof(float));
+
+                jmp(main_loop_label, T_NEAR);
+            }
+
+            L(main_loop_end_label);
+        }
+
+        L(tail_loop_label);
+        {
+            size_t loop_step = 1;
+
+            cmp(reg_work_amount, loop_step);
+            jl(tail_loop_end_label, T_NEAR);
+
+            for (int i = 0; i < jep.inputs_number; i++) {
+                if (jep.src_size[i] != 1)
+                    load_scalar(get_xmm_reg(i), ptr[get_src_reg(i)], jep.src_prc[i], exec_prc);
             }
 
-            store_scalar(ptr[reg_dst], xmm_dst, jep.dst_dt);
+            compute_eltwise_op();
+
+            apply_post_ops(true);
+
+            store_scalar(ptr[reg_dst], xmm_dst, exec_prc, jep.dst_prc);
 
-            if (jep.src0_step != 0)
-                add(reg_src0, jep.src0_step * jep.src0_data_size);
-            if (jep.src1_step != 0)
-                add(reg_src1, jep.src1_step * jep.src1_data_size);
-            add(reg_dst, jep.dst_step * jep.dst_data_size);
-            sub(reg_work_amount, 1);
-            add(reg_oc_off, 1 * sizeof(float));
+            for (int i = 0; i < jep.inputs_number; i++)
+                if (jep.src_size[i] != 1)
+                    add(get_src_reg(i), jep.src_prc[i].size() * loop_step);
+
+            add(reg_dst, jep.dst_prc.size() * loop_step);
+            sub(reg_work_amount, loop_step);
+            if (jep_.oc_size > 1)
+                add(reg_oc_off, loop_step * sizeof(float));
 
             jmp(tail_loop_label, T_NEAR);
         }
@@ -194,8 +246,10 @@ struct jit_uni_eltwise_fq_generic : public jit_uni_eltwise_fq_kernel, public jit
 
         this->postamble();
 
-        for (auto& inj : eltwise_injectors)
-            inj->prepare_table();
+        eltwise_emitter->emit_table();
+        for (int i = 0; i < post_op_emitters.size(); i++) {
+            post_op_emitters[i]->emit_table();
+        }
 
         ker_ = (decltype(ker_)) this->getCode();
     }
@@ -203,95 +257,306 @@ struct jit_uni_eltwise_fq_generic : public jit_uni_eltwise_fq_kernel, public jit
 private:
     using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
 
-    const int simd_w = cpu_isa_traits<isa>::vlen / sizeof(float);
+    // Returns the general-purpose register holding the data pointer of input `idx`.
+    // Inputs are mapped onto consecutive register indices, starting from r8.
+    Reg64 get_src_reg(int idx) {
+        const int first_src_reg_idx = r8.getIdx();
+        return Reg64(first_src_reg_idx + idx);
+    }
+
+    // Returns the vector register assigned to input `idx` (register indices start at 1).
+    Vmm get_vmm_reg(int idx) {
+        const int vmm_idx = idx + 1;
+        return Vmm(vmm_idx);
+    }
+
+    // Returns the `idx`-th auxiliary (scratch) vector register handed to emitters;
+    // auxiliary register indices start at 10.
+    Vmm get_aux_vmm(int idx) {
+        const int aux_vmm_idx = idx + 10;
+        return Vmm(aux_vmm_idx);
+    }
+
+    // Returns the 128-bit view of the vector register assigned to input `idx`
+    // (same register index as get_vmm_reg(idx)); used by the scalar code paths.
+    Xmm get_xmm_reg(int idx) {
+        const Vmm input_vmm = get_vmm_reg(idx);
+        return Xmm(input_vmm.getIdx());
+    }
+
+    Reg64 reg_dst = rbx;  // output pointer; loaded from the `dst` kernel param and advanced by every loop
+    Reg64 reg_work_amount = rdx;  // number of elements left to process; each loop subtracts its step
 
-    Reg64 reg_src0 = r8;
-    Reg64 reg_src1 = r9;
-    Reg64 reg_dst = r10;
-    Reg64 reg_work_amount = r11;
-    Reg64 reg_oc_off = r13;
+    Reg64 reg_oc_off = abi_not_param1;  // loaded from the `oc_off` kernel param, advanced by step * sizeof(float) when jep_.oc_size > 1, and passed to the quantization injectors' init_*_ptrs()
     Reg64 reg_params = abi_param1;
 
-    Reg8 reg_tmp_8 = r12b;
-    Reg32 reg_tmp_32 = r12d;
-    Reg64 reg_tmp_64 = r12;
+    Reg8 reg_tmp_8 = Reg8(r15.getIdx());  // 8-bit view of the scratch register (same index as r15)
+    Reg32 reg_tmp_32 = Reg32(r15.getIdx());  // 32-bit view of the same scratch register
+    Reg64 reg_tmp_64 = Reg64(r15.getIdx());  // 64-bit view; NOTE(review): get_src_reg(7) would also map to r15 - presumably at most 7 inputs are used; confirm
 
-    Reg64 reg_d_weights = r14;
-    Reg64 reg_d_bias = r15;
+    Reg64 reg_d_weights = rbp;  // handed to the quantization injectors at construction
+    Reg64 reg_d_bias = rsi;  // handed to the quantization injectors at construction
 
-    Vmm vmm_src0 = Vmm(0);
-    Vmm vmm_src1 = Vmm(1);
-    Vmm vmm_dst = Vmm(2);
-    Xmm xmm_src0 = Xmm(0);
-    Xmm xmm_src1 = Xmm(1);
-    Xmm xmm_dst = Xmm(2);
+    Vmm vmm_dst = Vmm(9);  // result register of the main emitter and of all post-ops; NOTE(review): get_vmm_reg(8) would alias it - confirm the input count stays below that
+    Xmm xmm_dst = Xmm(9);  // 128-bit view of vmm_dst for the scalar store path
 
-    Vmm vmm_d_weights = Vmm(3);
-    Vmm vmm_d_bias = Vmm(4);
+    Vmm vmm_d_weights = Vmm(12);  // handed to the quantization injectors; NOTE(review): same index as get_aux_vmm(2) - verify no emitter needs more than two aux vectors, or that the overlap is harmless
+    Vmm vmm_d_bias = Vmm(13);  // handed to the quantization injectors; same index as get_aux_vmm(3), see the note on vmm_d_weights
+    Vmm vmm_zero = Vmm(15);  // zeroed with vpxord on avx512_common before the loops; used as the zero operand of vmaxps in store_vector
 
-    Vmm vmm_zero = Vmm(5);
+    std::shared_ptr<jit_emitter> eltwise_emitter = nullptr;  // emitter of the node's own operation, created by create_eltwise_emitter()
+    std::vector<std::shared_ptr<jit_emitter>> post_op_emitters = {};  // one emitter per fused Eltwise node, in fusing order
 
-    std::vector<std::shared_ptr<jit_uni_eltwise_injector_f32<isa>>> eltwise_injectors;
-    std::vector<std::shared_ptr<jit_uni_quantization_injector_f32<isa>>> quantization_injectors;
+    std::vector<std::shared_ptr<jit_uni_quantization_injector_f32<isa>>> quantization_injectors = {};  // one injector per fused Quantize node, in fusing order
 
-    inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
-        switch (src_dt) {
-            case memory::f32:
-            case memory::s32:
-                uni_vmovups(vmm_src, op);
-                break;
-            case memory::s8:
-                uni_vpmovsxbd(vmm_src, op);
-                break;
-            case memory::u8:
-                uni_vpmovzxbd(vmm_src, op);
-                break;
-            default:
-                assert(!"unknown dst_dt");
+    std::vector<Precision> exec_precisions_priority = {  // candidate execution precisions in priority order; NOTE(review): the consumer is outside this view - presumably the first entry accepted by every emitter becomes exec_prc; confirm
+        Precision::U8,
+        Precision::I8,
+        Precision::U16,
+        Precision::I16,
+        Precision::BF16,
+        Precision::I32,  // load_*/store_* in this kernel only handle I32 and FP32 as the execution precision
+        Precision::FP32
+    };
+
+    std::set<Precision> get_supported_precisions(MKLDNNNode& node) {
+        auto& eltwiseNode = dynamic_cast<const MKLDNNEltwiseNode&>(node);
+        switch (eltwiseNode.getOpType()) {
+            case Relu: case Gelu: case Elu: case Tanh: case Logistic: case Square: case Abs: case Sqrt:
+            case Linear: case BoundedRelu: case SoftRelu: case Relu6: case Exp: case Clamp: case Swish: case Hswish: case Mish: case Hsigmoid:
+                return jit_mkldnn_emitter::get_supported_precisions();
+            case Add:               return jit_add_emitter::get_supported_precisions();
+            case MulAdd:            return jit_mul_add_emitter::get_supported_precisions();
+            case Subtract:          return jit_subtract_emitter::get_supported_precisions();
+            case Multiply:          return jit_multiply_emitter::get_supported_precisions();
+            case Divide:            return jit_divide_emitter::get_supported_precisions();
+            case FloorMod:          return jit_floor_mod_emitter::get_supported_precisions();
+            case Mod:               return jit_mod_emitter::get_supported_precisions();
+            case Maximum:           return jit_maximum_emitter::get_supported_precisions();
+            case Minimum:           return jit_minimum_emitter::get_supported_precisions();
+            case SquaredDifference: return jit_squared_difference_emitter::get_supported_precisions();
+            case PowerDynamic:      return jit_power_dynamic_emitter::get_supported_precisions();
+            case Equal:             return jit_equal_emitter::get_supported_precisions();
+            case NotEqual:          return jit_not_equal_emitter::get_supported_precisions();
+            case Greater:           return jit_greater_emitter::get_supported_precisions();
+            case GreaterEqual:      return jit_greater_equal_emitter::get_supported_precisions();
+            case Less:              return jit_less_emitter::get_supported_precisions();
+            case LessEqual:         return jit_less_equal_emitter::get_supported_precisions();
+            case LogicalAnd:        return jit_logical_and_emitter::get_supported_precisions();
+            case LogicalOr:         return jit_logical_or_emitter::get_supported_precisions();
+            case LogicalXor:        return jit_logical_xor_emitter::get_supported_precisions();
+            case LogicalNot:        return jit_logical_not_emitter::get_supported_precisions();
+            case PowerStatic:       return jit_power_static_emitter::get_supported_precisions();
+            case Prelu:             return jit_prelu_emitter::get_supported_precisions();
+            default: THROW_IE_EXCEPTION << "Unsupported operation type for Eltwise emitter";
+        }
+    }
+
+    std::shared_ptr<jit_emitter> create_eltwise_emitter(MKLDNNNode& node, Precision exec_prec) {
+        auto& eltwiseNode = dynamic_cast<const MKLDNNEltwiseNode&>(node);
+        switch (eltwiseNode.getOpType()) {
+            case Relu: case Gelu: case Elu: case Tanh: case Logistic: case Square: case Abs: case Sqrt:
+            case Linear: case BoundedRelu: case SoftRelu: case Relu6: case Exp: case Clamp: case Swish: case Hswish: case Mish: case Hsigmoid:
+                                    return std::make_shared<jit_mkldnn_emitter>(this, isa, eltwiseNode, exec_prec);
+            case Add:               return std::make_shared<jit_add_emitter>(this, isa, eltwiseNode, exec_prec);
+            case MulAdd:            return std::make_shared<jit_mul_add_emitter>(this, isa, eltwiseNode, exec_prec);
+            case Subtract:          return std::make_shared<jit_subtract_emitter>(this, isa, eltwiseNode, exec_prec);
+            case Multiply:          return std::make_shared<jit_multiply_emitter>(this, isa, eltwiseNode, exec_prec);
+            case Divide:            return std::make_shared<jit_divide_emitter>(this, isa, eltwiseNode, exec_prec);
+            case FloorMod:          return std::make_shared<jit_floor_mod_emitter>(this, isa, eltwiseNode, exec_prec);
+            case Mod:               return std::make_shared<jit_mod_emitter>(this, isa, eltwiseNode, exec_prec);
+            case Maximum:           return std::make_shared<jit_maximum_emitter>(this, isa, eltwiseNode, exec_prec);
+            case Minimum:           return std::make_shared<jit_minimum_emitter>(this, isa, eltwiseNode, exec_prec);
+            case SquaredDifference: return std::make_shared<jit_squared_difference_emitter>(this, isa, eltwiseNode, exec_prec);
+            case PowerDynamic:      return std::make_shared<jit_power_dynamic_emitter>(this, isa, eltwiseNode, exec_prec);
+            case Equal:             return std::make_shared<jit_equal_emitter>(this, isa, eltwiseNode, exec_prec);
+            case NotEqual:          return std::make_shared<jit_not_equal_emitter>(this, isa, eltwiseNode, exec_prec);
+            case Greater:           return std::make_shared<jit_greater_emitter>(this, isa, eltwiseNode, exec_prec);
+            case GreaterEqual:      return std::make_shared<jit_greater_equal_emitter>(this, isa, eltwiseNode, exec_prec);
+            case Less:              return std::make_shared<jit_less_emitter>(this, isa, eltwiseNode, exec_prec);
+            case LessEqual:         return std::make_shared<jit_less_equal_emitter>(this, isa, eltwiseNode, exec_prec);
+            case LogicalAnd:        return std::make_shared<jit_logical_and_emitter>(this, isa, eltwiseNode, exec_prec);
+            case LogicalOr:         return std::make_shared<jit_logical_or_emitter>(this, isa, eltwiseNode, exec_prec);
+            case LogicalXor:        return std::make_shared<jit_logical_xor_emitter>(this, isa, eltwiseNode, exec_prec);
+            case LogicalNot:        return std::make_shared<jit_logical_not_emitter>(this, isa, eltwiseNode, exec_prec);
+            case PowerStatic:       return std::make_shared<jit_power_static_emitter>(this, isa, eltwiseNode, exec_prec);
+            case Prelu:             return std::make_shared<jit_prelu_emitter>(this, isa, eltwiseNode, exec_prec);
+            default: THROW_IE_EXCEPTION << "Unsupported operation type for Eltwise emitter";
+        }
+    }
+
+    inline void compute_eltwise_op() {
+        std::vector<size_t> in_idxs;
+        std::vector<size_t> aux_idxs;
+        for (int i = 0; i < eltwise_emitter->get_inputs_num(); i++)
+            in_idxs.push_back(get_vmm_reg(i).getIdx());
+        for (int i = 0; i < eltwise_emitter->aux_vecs_count(); i++)
+            aux_idxs.push_back(get_aux_vmm(i).getIdx());
+
+        std::vector<size_t> out_idxs;
+        out_idxs.push_back(vmm_dst.getIdx());
+
+        eltwise_emitter->emit(in_idxs, out_idxs, aux_idxs);
+    }
+
+    inline void apply_post_ops(bool is_scalar, int offset = 0) {  // emits every fused post-op on vmm_dst, in fusing order; `is_scalar` and `offset` are forwarded to the quantization injectors
+        int input_idx = eltwise_emitter->get_inputs_num();  // fused eltwise ops take their extra operands from the input registers that follow the main op's inputs
+        int eltwise_post_op_idx = 0;
+        int quantization_post_op_idx = 0;
+        for (int i = 0; i < eltwiseNode.getFusedWith().size(); i++) {
+            if (eltwiseNode.getFusedWith()[i].get()->getType() == Eltwise) {
+                std::vector<size_t> in_idxs;
+                std::vector<size_t> aux_idxs;
+                in_idxs.push_back(vmm_dst.getIdx());  // the running result is always the first operand
+                for (int j = 1; j < post_op_emitters[eltwise_post_op_idx]->get_inputs_num(); j++)
+                    in_idxs.push_back(get_vmm_reg(input_idx++).getIdx());  // remaining operands: next unused input registers
+                for (int j = 0; j < post_op_emitters[eltwise_post_op_idx]->aux_vecs_count(); j++)
+                    aux_idxs.push_back(get_aux_vmm(j).getIdx());  // NOTE(review): same aux registers as the main emitter - presumably fine because the emitters run one after another; confirm
+
+                std::vector<size_t> out_idxs;
+                out_idxs.push_back(vmm_dst.getIdx());  // post-ops work in place on vmm_dst
+
+                post_op_emitters[eltwise_post_op_idx]->emit(in_idxs, out_idxs, aux_idxs);
+
+                eltwise_post_op_idx++;
+            } else {  // NOTE(review): every non-Eltwise fused node is handled as Quantize here, while injectors are only created for getType() == Quantize; the cast below is not null-checked
+                auto quantizeNode = dynamic_cast<MKLDNNQuantizeNode*>(eltwiseNode.getFusedWith()[i].get());
+
+                bool do_dequantization = quantizeNode->getAlgorithm() == mkldnn::quantization_quantize_dequantize;
+                bool do_rounding = do_dequantization || jep_.dst_prc == Precision::FP32 || i != eltwiseNode.getFusedWith().size() - 1;  // false only for a trailing, non-dequantizing quantize with a non-FP32 destination
+                int s_idx = vmm_dst.getIdx();
+
+                quantization_injectors[quantization_post_op_idx]->init_crop_ptrs(reg_oc_off);
+                quantization_injectors[quantization_post_op_idx]->compute_crop(s_idx, s_idx + 1, offset, is_scalar, jep_.oc_size == 1);  // NOTE(review): the last flag presumably selects broadcast of per-tensor parameters; confirm against the injector
+
+                quantization_injectors[quantization_post_op_idx]->init_input_scale_shift_ptrs(reg_oc_off);
+                quantization_injectors[quantization_post_op_idx]->compute_input_scale_shift(s_idx, s_idx + 1, offset, do_rounding,
+                                                                                            is_scalar, jep_.oc_size == 1);
+
+                quantization_injectors[quantization_post_op_idx]->init_output_scale_shift_ptrs(reg_oc_off);
+                quantization_injectors[quantization_post_op_idx]->compute_output_scale_shift(s_idx, s_idx + 1, offset, is_scalar, jep_.oc_size == 1);
+
+                quantization_post_op_idx++;
+            }
         }
+    }
+
+    inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, Precision src_prc, Precision dst_prc, bool broadcast) {
+        Xmm xmm_src = Xmm(vmm_src.getIdx());
+
+        if (broadcast) {
+            load_scalar(xmm_src, op, src_prc, dst_prc);
+            uni_vbroadcastss(vmm_src, xmm_src);
+        } else {
+            switch (src_prc) {
+                case Precision::FP32:
+                case Precision::I32:
+                    uni_vmovups(vmm_src, op);
+                    break;
+                case Precision::BF16:
+                    vpmovzxwd(vmm_src, op);
+                    uni_vpslld(vmm_src, vmm_src, 16);
+                    break;
+                case Precision::U16:
+                    uni_vpmovzxwd(vmm_src, op);
+                    break;
+                case Precision::I16:
+                    uni_vpmovsxwd(vmm_src, op);
+                    break;
+                case Precision::I8:
+                    uni_vpmovsxbd(vmm_src, op);
+                    break;
+                case Precision::U8:
+                    uni_vpmovzxbd(vmm_src, op);
+                    break;
+                default:
+                    assert(!"unknown src_prc");
+            }
 
-        if (src_dt != data_type::f32) {
-            uni_vcvtdq2ps(vmm_src, vmm_src);
+            switch (dst_prc) {
+                case Precision::FP32:
+                    if (src_prc != Precision::FP32 && src_prc != Precision::BF16)
+                        uni_vcvtdq2ps(vmm_src, vmm_src);
+                    break;
+                case Precision::I32:
+                    break;
+                default:
+                    assert(!"unknown dst_prc");
+            }
         }
     }
 
-    inline void load_scalar(Xmm xmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
-        switch (src_dt) {
-            case memory::f32:
-            case memory::s32:
+    inline void load_scalar(Xmm xmm_src, const Xbyak::Address &op, Precision src_prc, Precision dst_prc) {
+        switch (src_prc) {
+            case Precision::FP32:
+            case Precision::I32:
                 movss(xmm_src, op);
                 break;
-            case memory::s8:
+            case Precision::BF16:
+                uni_vpinsrw(xmm_src, xmm_src, op, 0);
+                uni_vpslld(xmm_src, xmm_src, 16);
+                break;
+            case Precision::I16:
+                uni_vpinsrw(xmm_src, xmm_src, op, 0);
+                uni_vpmovsxwd(xmm_src, op);
+                break;
+            case Precision::U16:
+                uni_vpinsrw(xmm_src, xmm_src, op, 0);
+                uni_vpmovzxwd(xmm_src, op);
+                break;
+            case Precision::I8:
                 movsx(reg_tmp_32, op);
                 movq(xmm_src, reg_tmp_64);
                 break;
-            case memory::u8:
+            case Precision::U8:
                 movzx(reg_tmp_32, op);
                 movq(xmm_src, reg_tmp_64);
                 break;
             default:
-                assert(!"unknown dst_dt");
+                assert(!"unknown src_prc");
         }
 
-        if (src_dt != data_type::f32) {
-            uni_vcvtdq2ps(xmm_src, xmm_src);
+        switch (dst_prc) {
+            case Precision::FP32:
+                if (src_prc != Precision::FP32 && src_prc != Precision::BF16)
+                    uni_vcvtdq2ps(xmm_src, xmm_src);
+                break;
+            case Precision::I32:
+                break;
+            default:
+                assert(!"unknown dst_prc");
         }
     }
 
-    inline void store_vector(const Xbyak::Address &op, Vmm vmm_dst, memory::data_type dst_dt) {
+    inline void store_vector(const Xbyak::Address &op, Vmm vmm_dst, Precision src_prc, Precision dst_prc) {
         Xmm xmm_dst = Xmm(vmm_dst.getIdx());
         Ymm ymm_dst = Ymm(vmm_dst.getIdx());
 
-        if (dst_dt != data_type::f32) {
-            uni_vcvtps2dq(vmm_dst, vmm_dst);
+        switch (src_prc) {
+            case Precision::FP32:
+                if (dst_prc != Precision::FP32 && dst_prc != Precision::BF16)
+                    uni_vcvtps2dq(vmm_dst, vmm_dst);
+                break;
+            case Precision::I32:
+                break;
+            default:
+                assert(!"unknown src_prc");
         }
 
-        switch (dst_dt) {
-            case memory::f32:
-            case memory::s32:
+        switch (dst_prc) {
+            case Precision::FP32:
+            case Precision::I32:
                 uni_vmovups(op, vmm_dst);
                 break;
-            case memory::s8:
+            case Precision::BF16:
+                vcvtneps2bf16(ymm_dst, vmm_dst);
+                uni_vmovups(op, ymm_dst);
+                break;
+            case Precision::I16:
+                if (isa == avx512_common) {
+                    vmaxps(vmm_dst, vmm_zero, vmm_dst);
+                    vpmovusdw(op, vmm_dst);
+                } else {
+                    uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst);
+                }
+                break;
+            case Precision::U16:
+                if (isa == avx512_common) {
+                    vpmovsdw(op, vmm_dst);
+                } else {
+                    uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst);
+                }
+                break;
+            case Precision::I8:
                 if (isa == avx512_common) {
                     vmaxps(vmm_dst, vmm_zero, vmm_dst);
                     vpmovsdb(op, vmm_dst);
@@ -306,7 +571,7 @@ private:
                         movd(op, xmm_dst);
                 }
                 break;
-            case memory::u8:
+            case Precision::U8:
                 if (isa == avx512_common) {
                     vpmovusdb(op, vmm_dst);
                 } else {
@@ -321,2377 +586,1069 @@ private:
                 }
                 break;
             default:
-                assert(!"unknown dst_dt");
+                assert(!"unknown dst_prc");
         }
     }
 
-    inline void store_scalar(const Xbyak::Address &op, Xmm xmm_dst, memory::data_type dst_dt) {
-        if (dst_dt != data_type::f32) {
-            uni_vcvtps2dq(xmm_dst, xmm_dst);
+    inline void store_scalar(const Xbyak::Address &op, Xmm xmm_dst, Precision src_prc, Precision dst_prc) {  // emits a store of lane 0 of xmm_dst to op, narrowed to dst_prc
+        switch (src_prc) {
+            case Precision::FP32:
+                if (dst_prc != Precision::FP32 && dst_prc != Precision::BF16)  // FP32 and BF16 destinations skip the float -> int32 conversion
+                    uni_vcvtps2dq(xmm_dst, xmm_dst);
+                break;
+            case Precision::I32:  // no conversion is emitted for an I32 source
+                break;
+            default:
+                assert(!"unknown src_prc");
         }
 
-        switch (dst_dt) {
-            case memory::f32:
-            case memory::s32:
+        switch (dst_prc) {
+            case Precision::FP32:
+            case Precision::I32:
                 movss(op, xmm_dst);
                 break;
-            case memory::s8:
+            case Precision::BF16:
+                uni_vpsrld(xmm_dst, xmm_dst, 16);  // shift the upper 16 bits of each dword down, then store the low word
+                uni_vpextrw(op, xmm_dst, 0x0);
+                break;
+            case Precision::I16:
+                uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst);
+                // write the whole 16-bit lane; a byte-wide mov would drop its high byte
+                uni_vpextrw(op, xmm_dst, 0x0);
+                break;
+            case Precision::U16:
+                uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst);
+                // write the whole 16-bit lane; a byte-wide mov would drop its high byte
+                uni_vpextrw(op, xmm_dst, 0x0);
+                break;
+            case Precision::I8:
                 uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst);
                 uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
                 movq(reg_tmp_64, xmm_dst);
                 mov(op, reg_tmp_8);
                 break;
-            case memory::u8:
+            case Precision::U8:
                 uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst);
                 uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
                 movq(reg_tmp_64, xmm_dst);
                 mov(op, reg_tmp_8);
                 break;
             default:
-                assert(!"unknown dst_dt");
+                assert(!"unknown dst_prc");
         }
     }
 };
 
 MKLDNNEltwiseNode::MKLDNNEltwiseNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache) :
-        MKLDNNNode(layer, eng, cache), eltiwse_fq_kernel(nullptr) {
-    op = EltwiseLayer::Sum;
+        MKLDNNNode(layer, eng, cache) {  // only forwards to the base node; no operation is preset here (init() assigns eltwiseOp)
 }
 
-bool MKLDNNEltwiseNode::isSum() {
-    auto * eltwiseLayer = dynamic_cast<EltwiseLayer*>(getCnnLayer().get());
-    if (eltwiseLayer == nullptr)
-        THROW_IE_EXCEPTION << "Cannot get eltwise layer " << getName();
-    return eltwiseLayer->_operation == EltwiseLayer::Sum;
-}
+InferenceEngine::details::caseless_map<std::string, std::function<void(GenericLayer*, EltwiseOpType&, mkldnn::algorithm&, float&, float&)>>  // activation name -> callback that fills (opType, algorithm, alpha, beta) from the layer
+MKLDNNEltwiseNode::initializers = {
+        {"relu", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+            alpha = activationLayer->GetParamAsFloat("negative_slope", 0.0f);  // alpha carries the "negative_slope" parameter (0.0f passed as the fallback)
+            beta = 0.0f;
+            opType = Relu;
+            algorithm = mkldnn::eltwise_relu;
+        }},
+        {"gelu", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+            alpha = 0.0f;
+            beta = 0.0f;
+            opType = Gelu;
+            algorithm = mkldnn::eltwise_gelu;
+        }},
+        {"elu", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+            alpha = activationLayer->GetParamAsFloat("alpha", 1.0f);
+            beta = 0.0f;
+            opType = Elu;
+            algorithm = mkldnn::eltwise_elu;
+        }},
+        {"tanh", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+            alpha = 0.0f;
+            beta = 0.0f;
+            opType = Tanh;
+            algorithm = mkldnn::eltwise_tanh;
+        }},
+        {"sigmoid", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+            alpha = 0.0f;
+            beta = 0.0f;
+            opType = Logistic;  // "sigmoid" and "logistic" resolve to the same op and algorithm
+            algorithm = mkldnn::eltwise_logistic;
+        }},
+        {"logistic", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+            alpha = 0.0f;
+            beta = 0.0f;
+            opType = Logistic;
+            algorithm = mkldnn::eltwise_logistic;
+        }},
+        {"square", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+            alpha = 0.0f;
+            beta = 0.0f;
+            opType = Square;
+            algorithm = mkldnn::eltwise_square;
+        }},
+        {"abs", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+            alpha = 0.0f;
+            beta = 0.0f;
+            opType = Abs;
+            algorithm = mkldnn::eltwise_abs;
+        }},
+        {"sqrt", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+            alpha = 0.0f;
+            beta = 0.0f;
+            opType = Sqrt;
+            algorithm = mkldnn::eltwise_sqrt;
+        }},
+        {"linear", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+            alpha = activationLayer->GetParamAsFloat("alpha", 1.0f);
+            beta = activationLayer->GetParamAsFloat("beta", 0.0f);
+            opType = Linear;
+            algorithm = mkldnn::eltwise_linear;
+        }},
+        {"bounded_relu", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+            alpha = activationLayer->GetParamAsFloat("alpha", 0.0f);
+            beta = 0.0f;
+            opType = BoundedRelu;
+            algorithm = mkldnn::eltwise_bounded_relu;
+        }},
+        {"soft_relu", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+            alpha = 0.0f;
+            beta = 0.0f;
+            opType = SoftRelu;
+            algorithm = mkldnn::eltwise_soft_relu;
+        }},
+        {"relu6", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+            alpha = activationLayer->GetParamAsFloat("n", 6.0f);
+            beta = 0.0f;
+            opType = Relu6;
+            algorithm = mkldnn::eltwise_bounded_relu;  // relu6 reuses the bounded_relu algorithm, with alpha taken from "n"
+        }},
+        {"clamp", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+            alpha = activationLayer->GetParamAsFloat("max", 1.0f);  // alpha holds the "max" parameter
+            beta = activationLayer->GetParamAsFloat("min", 0.0f);  // beta holds the "min" parameter
+            opType = Clamp;
+            algorithm = mkldnn::eltwise_clamp;
+        }},
+        {"exp", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+            alpha = 0.0f;
+            beta = 0.0f;
+            opType = Exp;
+            algorithm = mkldnn::eltwise_exp;
+        }},
+        {"not", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+            alpha = 0.0f;
+            beta = 0.0f;
+            opType = LogicalNot;  // NOTE(review): algorithm is not assigned in this entry, so it keeps the value the caller passed in
+        }},
+        {"swish", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+            alpha = activationLayer->GetParamAsFloat("alpha", 1.0f);
+            beta = 0.0f;
+            opType = Swish;
+            algorithm = mkldnn::eltwise_swish;
+        }},
+        {"hswish", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+            alpha = 0.0f;
+            beta = 0.0f;
+            opType = Hswish;
+            algorithm = mkldnn::eltwise_hswish;
+        }},
+        {"mish", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+            alpha = 0.0f;
+            beta = 0.0f;
+            opType = Mish;
+            algorithm = mkldnn::eltwise_mish;
+        }},
+        {"hsigmoid", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+            alpha = 0.0f;
+            beta = 0.0f;
+            opType = Hsigmoid;
+            algorithm = mkldnn::eltwise_hsigmoid;
+        }},
+};
 
-bool MKLDNNEltwiseNode::isUnitScales() {
-    auto * eltwiseLayer = dynamic_cast<EltwiseLayer*>(getCnnLayer().get());
-    if (eltwiseLayer == nullptr)
-        THROW_IE_EXCEPTION << "Cannot get eltwise layer " << getName();
+void MKLDNNEltwiseNode::init() {  // derives eltwiseOp (and, where assigned, eltwiseAlgorithm / alpha / beta / gamma) from the wrapped CNN layer
+    InferenceEngine::details::CaselessEq<std::string> comparator;
+    auto layerType = getCnnLayer().get()->type;
 
-    if (eltwiseLayer->coeff.empty())
-        return true;
+    auto * eltwiseLayer = dynamic_cast<EltwiseLayer*>(getCnnLayer().get());
+    if (eltwiseLayer) {  // layer is an EltwiseLayer: translate its _operation enum one-to-one
+        if (!eltwiseLayer->coeff.empty())  // per-input coefficients are rejected outright
+            THROW_IE_EXCEPTION << "Eltwise node with name `" << getName() << "` doesn't support input coefficients.";
+
+        switch (eltwiseLayer->_operation) {
+            case EltwiseLayer::Sum: eltwiseOp = Add; break;
+            case EltwiseLayer::Prod: eltwiseOp = Multiply; break;
+            case EltwiseLayer::Max: eltwiseOp = Maximum; break;
+            case EltwiseLayer::Sub: eltwiseOp = Subtract; break;
+            case EltwiseLayer::Min: eltwiseOp = Minimum; break;
+            case EltwiseLayer::Div: eltwiseOp = Divide; break;
+            case EltwiseLayer::Squared_diff: eltwiseOp = SquaredDifference; break;
+            case EltwiseLayer::Floor_mod: eltwiseOp = FloorMod; break;
+            case EltwiseLayer::Pow: eltwiseOp = PowerDynamic; break;
+            case EltwiseLayer::Equal: eltwiseOp = Equal; break;
+            case EltwiseLayer::Not_equal: eltwiseOp = NotEqual; break;
+            case EltwiseLayer::Greater: eltwiseOp = Greater; break;
+            case EltwiseLayer::Greater_equal: eltwiseOp = GreaterEqual; break;
+            case EltwiseLayer::Less: eltwiseOp = Less; break;
+            case EltwiseLayer::Less_equal: eltwiseOp = LessEqual; break;
+            case EltwiseLayer::Logical_AND: eltwiseOp = LogicalAnd; break;
+            case EltwiseLayer::Logical_OR: eltwiseOp = LogicalOr; break;
+            case EltwiseLayer::Logical_XOR: eltwiseOp = LogicalXor; break;
+            default: THROW_IE_EXCEPTION << "Unsupported algorithm for Eltwise node with name `" << getName() << "`.";
+        }
+    } else if (comparator(layerType, "mod")) {
+        eltwiseOp = Mod;
+    } else if (comparator(layerType, "power")) {
+        eltwiseOp = PowerStatic;
+
+        auto *powerLayer = dynamic_cast<InferenceEngine::PowerLayer *>(getCnnLayer().get());
+        if (powerLayer == nullptr)
+            THROW_IE_EXCEPTION << "Cannot convert power layer.";
+
+        alpha = powerLayer->power;  // alpha/beta/gamma carry power/scale/offset for PowerStatic
+        beta = powerLayer->scale;
+        gamma = powerLayer->offset;
+    } else if (comparator(layerType, "scaleshift")) {
+        if (getCnnLayer().get()->blobs.size() == 2) {  // two blobs -> MulAdd; any other count is treated as a plain Multiply
+            eltwiseOp = MulAdd;
+            eltwiseAlgorithm = mkldnn::depthwise_scale_shift;
+        } else {
+            eltwiseOp = Multiply;
+        }
+    } else if (comparator(layerType, "prelu")) {
+        eltwiseOp = Prelu;
+        eltwiseAlgorithm = mkldnn::depthwise_prelu;
+    } else if (comparator(layerType, "activation") && initializers.find(getCnnLayer().get()->GetParamAsString("type")) != initializers.end()) {  // generic activation: dispatch on its "type" parameter when the table knows it
+        initializers[getCnnLayer().get()->GetParamAsString("type")](getCnnLayer().get(), eltwiseOp, eltwiseAlgorithm, alpha, beta);
+    } else if (comparator(layerType, "relu") ||
+               comparator(layerType, "gelu") ||
+               comparator(layerType, "elu") ||
+               comparator(layerType, "sigmoid") ||
+               comparator(layerType, "logistic") ||
+               comparator(layerType, "tanh") ||
+               comparator(layerType, "relu6") ||
+               comparator(layerType, "exp") ||
+               comparator(layerType, "not") ||
+               comparator(layerType, "clamp") ||
+               comparator(layerType, "swish") ||
+               comparator(layerType, "hswish") ||
+               comparator(layerType, "mish") ||
+               comparator(layerType, "hsigmoid")) {
+        initializers[layerType](getCnnLayer().get(), eltwiseOp, eltwiseAlgorithm, alpha, beta);  // layer type itself is the table key for these names
+    } else {
+        THROW_IE_EXCEPTION << "Unsupported algorithm for Eltwise node with name `" << getName() << "`.";
+    }
+}
 
-    for (auto scale : eltwiseLayer->coeff) {
-        if (scale != 1.0f)
-            return false;
+size_t MKLDNNEltwiseNode::getOpInputsNum() const {  // number of inputs the operation returned by getOpType() takes: 1, 2 or 3
+    switch (getOpType()) {
+        case Relu: case Gelu: case Elu: case Tanh: case Logistic: case Square: case Abs: case Sqrt: case PowerStatic:
+        case Linear: case BoundedRelu: case SoftRelu: case Relu6: case Exp: case Clamp: case Swish: case Hswish: case Mish: case Hsigmoid:
+        case LogicalNot:
+            return 1;
+        case Add: case Subtract: case Multiply: case Divide: case FloorMod: case Mod: case Maximum: case Minimum: case SquaredDifference:
+        case PowerDynamic: case Equal: case NotEqual: case Greater: case GreaterEqual: case Less: case LessEqual: case LogicalAnd:
+        case LogicalOr: case LogicalXor: case Prelu:
+            return 2;
+        case MulAdd:
+            return 3;
+        default: THROW_IE_EXCEPTION << "Unsupported operation for Eltwise node with name `" << getName() << "`.";  // any op not listed above is rejected
     }
+}
 
-    return true;
+bool MKLDNNEltwiseNode::isSum() {  // true when the stored eltwiseOp is Add
+    return eltwiseOp == Add;
 }
 
 bool MKLDNNEltwiseNode::isWithBroadcast() {
-    bool withBroadcast = false;
     auto oDims = outDims[0].ToSizeVector();
     for (size_t i = 0; i < inDims.size(); i++) {
         auto iDims = inDims[i].ToSizeVector();
-        for (size_t j = 1; j <= iDims.size(); j++) {
-            if (oDims[oDims.size() - j] != iDims[iDims.size() - j]) {
-                if (iDims[iDims.size() - j] == 1) {
-                    withBroadcast = true;
-                } else {
-                    THROW_IE_EXCEPTION << "Incorrect dimensions for broadcasting for " << getName();
-                }
-            }
-            if (iDims.size() < oDims.size())
-                withBroadcast = true;
-        }
-        if (iDims.size() == 0 && oDims.size())
-            withBroadcast = true;
+        if (iDims != oDims)  // any input whose dims differ from the first output's dims counts as broadcast; no compatibility check is done here
+            return true;
     }
 
-    return withBroadcast;
+    return false;  // every input has exactly the output dims
 }
 
 void MKLDNNEltwiseNode::getSupportedDescriptors() {
-    auto * eltwiseLayer = dynamic_cast<EltwiseLayer*>(getCnnLayer().get());
-
-    if (eltwiseLayer == nullptr)
-        THROW_IE_EXCEPTION << "Cannot convert eltwise layer.";
-    op = eltwiseLayer->_operation;
-
-    if (getParentEdges().size() < 2)
+    if (getParentEdges().size() < 1)  // a single input edge is now sufficient; only a node with no inputs is rejected
         THROW_IE_EXCEPTION << "Incorrect number of input edges for layer " << getName();
     if (getChildEdges().empty())
         THROW_IE_EXCEPTION << "Incorrect number of output edges for layer " << getName();
-    if (op == EltwiseLayer::Squared_diff)
-        if (getParentEdges().size() != 2)
-            THROW_IE_EXCEPTION  << "Incorrect number of input edges for layer " << getName() << " for operation squared_diff.\n"
-                << "Expected: 2\n" << "Actual: " << getParentEdges().size();
+}  // only the edge-count checks above remain in this function
 
-    auto outDims = getChildEdgeAt(0)->getDims();
-    for (size_t i = 0; i < getParentEdges().size(); i++) {
-        auto inDims = getParentEdgeAt(i)->getDims();
-        batch_dim = std::min(batch_dim, 5 - inDims.ndims());
-    }
+void MKLDNNEltwiseNode::initSupportedPrimitiveDescriptors() {  // validates input count and precisions, then registers up to three layout variants (channels-first, blocked, planar)
+    std::vector<Precision> supportedPrecisions = {
+            Precision::FP32,
+            Precision::U8,
+            Precision::I8,
+            Precision::U16,
+            Precision::I16,
+            Precision::BF16,
+            Precision::I32
+    };
 
-    broadcast = isWithBroadcast();
-    if (broadcast) {
-        auto outDims = getChildEdgeAt(0)->getDims();
-        for (size_t i = 0; i < getParentEdges().size(); i++) {
-            auto inDims = getParentEdgeAt(i)->getDims();
-            if (inDims.ndims() > 5 || outDims.ndims() > 5)
-                THROW_IE_EXCEPTION << "Eltwise node in broadcasting mode doesn't support more than 5 dims for blobs";
+    if (!supportedPrimitiveDescriptors.empty())  // descriptors already initialized: nothing to do
+        return;
+
+    canUseOptimizedImpl = mayiuse(cpu::sse42);  // without sse42 every precision is forced to FP32 by filterPrecision below
+
+    size_t expectedInputsNum = getOpInputsNum();
+    for (auto& postOp : fusedWith) {
+        auto* eltwiseNode = dynamic_cast<const MKLDNNEltwiseNode*>(postOp.get());
+        if (eltwiseNode != nullptr) {
+            expectedInputsNum += eltwiseNode->getOpInputsNum() - 1;  // a fused eltwise adds all of its inputs except one
         }
     }
+    if (getParentEdges().size() > MAX_ELTWISE_INPUTS)
+        THROW_IE_EXCEPTION << "Eltwise node with name `" << getName() << "` doesn't support more than " << MAX_ELTWISE_INPUTS
+                           << " inputs (actual = " << getParentEdges().size() << ")";
 
-    bool with_coeffs = !eltwiseLayer->coeff.empty();
-    if (op != EltwiseLayer::Sum && with_coeffs)
-        THROW_IE_EXCEPTION << "Only sum operation supports operands coefficients";
+    if (expectedInputsNum != getParentEdges().size())
+        THROW_IE_EXCEPTION << "Eltwise node with name `" << getName() << "` has invalid input number of inputs: expected = " << expectedInputsNum
+                           << " (actual = " << getParentEdges().size() << ")";
 
-    if (with_coeffs && eltwiseLayer->coeff.size() != getParentEdges().size())
-        THROW_IE_EXCEPTION << "Number of provided coefficients is not equal to number of operands";
+    std::vector<InferenceEngine::Precision> inputPrecisions;  // precisions of this layer's inputs, followed by the extra inputs of fused eltwise nodes
+    for (int i = 0; i < getCnnLayer()->insData.size(); i++) {
+        inputPrecisions.push_back(getCnnLayer()->insData[i].lock()->getPrecision());
+    }
 
-    if (with_coeffs && eltwiseLayer->precision != Precision::FP32)
-        THROW_IE_EXCEPTION << "Sum with coefficients supports only FP32 precision";
+    for (auto& fusedNode : fusedWith) {
+        if (fusedNode->getType() == Eltwise) {
+            for (int i = 1; i < fusedNode->getCnnLayer()->insData.size(); i++) {  // starts at 1: input 0 of a fused node is skipped
+                inputPrecisions.push_back(fusedNode->getCnnLayer()->insData[i].lock()->getPrecision());
+            }
+        }
+    }
 
-    sum_scales.clear();
-    for (int i = 0; i < getParentEdges().size(); i++)
-        sum_scales.push_back(with_coeffs ? eltwiseLayer->coeff[i] : 1.0f);
-}
+    if (inputPrecisions.size() != getParentEdges().size())
+        THROW_IE_EXCEPTION << "Eltwise node with name `" << getName() << "` has invalid input precisions configuration.";
 
-void MKLDNNEltwiseNode::initSupportedPrimitiveDescriptors() {
-    if (!supportedPrimitiveDescriptors.empty())
-        return;
+    InferenceEngine::Precision outputPrecision = getCnnLayer()->outData[0]->getPrecision();
+    if (!fusedWith.empty()) {
+        auto lastFusedLayer = fusedWith[fusedWith.size() - 1].get()->getCnnLayer();
+        if (lastFusedLayer) {
+            outputPrecision = lastFusedLayer->outData[0]->getPrecision();  // the last fused layer, when it has a CNN layer, decides the output precision
+        }
+    }
 
-    setPostOps(attr, true);
+    if (!mayiuse(avx512_core_bf16)) {  // BF16 on any input or on the output is rejected when avx512_core_bf16 is unavailable
+        bool hasBF16 = false;
+        for (auto &inPrc : inputPrecisions)
+            if (inPrc == Precision::BF16)
+                hasBF16 = true;
 
-    auto initDesc = [&] (mkldnn::memory::data_type inputDT, mkldnn::memory::data_type outputDT, memory::format format) -> PrimitiveDescInfo {
-        InferenceEngine::LayerConfig config;
-        impl_desc_type impl_type = impl_desc_type::ref;
-        config.dynBatchSupport = true;
-        for (size_t i = 0; i < getParentEdges().size(); i++) {
-            InferenceEngine::DataConfig dataConfig;
-            dataConfig.inPlace = (!i && canBeInPlace()) ? 0 : -1;
-            dataConfig.constant = false;
+        if (outputPrecision == Precision::BF16 || hasBF16)
+            THROW_IE_EXCEPTION << "Eltwise node with name `" << getName() << "` doesn't support BF16 precision on this target.";
+    }
 
-            if (!broadcast) {
-                dataConfig.desc = MKLDNNMemoryDesc(getParentEdgeAt(i)->getDims(), inputDT, format);
-                config.inConfs.push_back(dataConfig);
+    auto filterPrecision = [&](Precision& prc) {  // maps a requested precision to the one actually used, or throws if it cannot be mapped
+        if (!canUseOptimizedImpl) {
+            return Precision(Precision::FP32);
+        } else if (std::find(supportedPrecisions.begin(), supportedPrecisions.end(), prc) == supportedPrecisions.end()) {
+            if (prc == Precision::U32 || prc == Precision::I64 || prc == Precision::U64) {  // these unsupported integer precisions are mapped to I32
+                return Precision(Precision::I32);
             } else {
-                // Broadcasting support
-                if (MKLDNNMemory::IsPlainFormat(format)) {
-                    dataConfig.desc = MKLDNNMemoryDesc(getParentEdgeAt(i)->getDims(), inputDT,
-                            MKLDNNMemory::GetPlainFormat(getParentEdgeAt(i)->getDims()));
-                    config.inConfs.push_back(dataConfig);
-                } else {
-                    // Unsupported format for broadcast mode. Should be skipped.
-                    // Will mark it as undef and outer code should filter it.
-                    impl_type = impl_desc_type::undef;
-                }
+                THROW_IE_EXCEPTION << "Eltwise node with name `" << getName() << "` doesn't support " << prc << " precision.";
             }
+        } else {
+            return prc;
         }
-
-        InferenceEngine::DataConfig dataConfig;
-            dataConfig.inPlace = -1;
-            dataConfig.constant = false;
-            dataConfig.desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDT, format);
-            config.outConfs.push_back(dataConfig);
-        return {config, impl_type, format};
     };
 
-    if (fusedWith.empty()) {
-        for (const auto& format : getAvailableFormatsForDims(getChildEdgeAt(0)->getDims())) {
-            // Precision of implementation is defined by precision of output tensor
-            auto prec = getCnnLayer()->outData[0]->getPrecision();
-            mkldnn::memory::data_type inputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(prec);
-            mkldnn::memory::data_type outputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(prec);
-
-            // Eltwise compare operation can have the input type different from the output type
-            auto node_op = this->op;
-            bool is_eltwise_compare_node = ((node_op == EltwiseLayer::eOperation::Equal) ||
-                                            (node_op == EltwiseLayer::eOperation::Not_equal) ||
-                                            (node_op == EltwiseLayer::eOperation::Greater) ||
-                                            (node_op == EltwiseLayer::eOperation::Greater_equal) ||
-                                            (node_op == EltwiseLayer::eOperation::Less) ||
-                                            (node_op == EltwiseLayer::eOperation::Less_equal));
-            if (is_eltwise_compare_node) {
-                auto in_prec = getCnnLayer()->insData[0].lock()->getPrecision();
-                inputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(in_prec);
-            }
+    for (int i = 0; i < inputPrecisions.size(); i++) {
+        inputPrecisions[i] = filterPrecision(inputPrecisions[i]);
+    }
+    outputPrecision = filterPrecision(outputPrecision);
 
-            if (inputDT == memory::bf16 || outputDT == memory::bf16) {
-                inputDT = memory::f32;
-                outputDT = memory::f32;
-            }
+    // TODO: delete after new LPT (ngraph based) is merged
+    // WA is needed to handle bug in LPT that produces wrong precision after average pooling (I8/U8 instead of FP32)
+    if (eltwiseOp == MulAdd && (inputPrecisions[0] == Precision::U8 || inputPrecisions[0] == Precision::I8)) {
+        auto poolingLayer = dynamic_cast<PoolingLayer*>(getParentEdgesAtPort(0)[0]->getParent()->getCnnLayer().get());
+        if (poolingLayer && poolingLayer->_type == PoolingLayer::AVG) {
+            inputPrecisions[0] = Precision::FP32;  // override applies only when the port-0 parent is an average pooling layer
+        }
+    }
+
+    enum LayoutType {
+        Planar,
+        ChannelsFirst,
+        Blocked
+    };
 
-            auto impl_desc = initDesc(inputDT, outputDT, format);
+    auto initDesc = [&] (LayoutType lt) -> PrimitiveDescInfo {  // builds one primitive descriptor with all inputs and the output in layout lt
+        auto createMemoryDesc = [lt](MKLDNNEdgePtr edge, Precision prc, size_t offset) -> TensorDesc {  // blocking descriptor for one edge in layout lt
+            if (lt == ChannelsFirst) {
+                std::vector<size_t> blocks = edge->getDims().ToSizeVector();
+                std::vector<size_t> order;
+                order.push_back(0);  // order becomes 0, 2, 3, ..., 1: axis 1 is moved to the end
+                for (size_t j = 2; j < blocks.size(); j++)
+                    order.push_back(j);
+                if (blocks.size() > 1)
+                    order.push_back(1);
+
+                return MKLDNNMemoryDesc(TensorDesc(prc, edge->getDims().ToSizeVector(), {blocks, order, offset}));  // NOTE(review): blocks stays in logical dim order while order is permuted — confirm this is intended
+            } else if (lt == Blocked && edge->getDims()[1] != 1) {  // an edge with dims[1] == 1 falls through to the planar branch
+                size_t blockSize = mayiuse(cpu::avx512_common) ? 16 : 8;
+
+                std::vector<size_t> blocks = edge->getDims().ToSizeVector();
+                std::vector<size_t> order(blocks.size());
+                for (size_t j = 0; j < order.size(); j++)
+                    order[j] = j;
+
+                blocks[1] = div_up(blocks[1], blockSize);  // axis 1 is split into div_up(dim, blockSize) outer blocks plus an inner block of blockSize
+                blocks.push_back(blockSize);
+                order.push_back(1);
+
+                return MKLDNNMemoryDesc(TensorDesc(prc, edge->getDims().ToSizeVector(), {blocks, order, offset}));
+            } else {
+                std::vector<size_t> blocks = edge->getDims().ToSizeVector();
+                std::vector<size_t> order(blocks.size());
+                for (size_t j = 0; j < order.size(); j++)
+                    order[j] = j;
 
-            if (impl_desc.getImplementationType() != impl_desc_type::undef) {
-                supportedPrimitiveDescriptors.push_back(impl_desc);
+                return MKLDNNMemoryDesc(TensorDesc(prc, edge->getDims().ToSizeVector(), {blocks, order, offset}));  // planar: identity order, blocks equal to dims
             }
-        }
-    } else {
-        auto ndims = getCnnLayer()->outData[0]->getDims().size();
-        auto format = ndims == 2 ? memory::format::nc :
-                      ndims == 4 ? memory::format::nhwc :
-                      memory::format::ndhwc;
+        };
 
+        size_t offset = std::numeric_limits<size_t>::max();  // NOTE(review): presumably marks the offset as undefined — confirm against TensorDesc/BlockingDesc
         InferenceEngine::LayerConfig config;
-        impl_desc_type impl_type = impl_desc_type::ref;
-        config.dynBatchSupport = true;
+        config.dynBatchSupport = getChildEdgeAt(0)->getDims().ndims() > 1 && getChildEdgeAt(0)->getDims() == getParentEdgeAt(0)->getDims();  // needs rank > 1 and output dims equal to input 0 dims
+
         for (size_t i = 0; i < getParentEdges().size(); i++) {
             InferenceEngine::DataConfig dataConfig;
-            dataConfig.inPlace = -1;
+            dataConfig.inPlace = (!i && canBeInPlace() && inputPrecisions[i] == outputPrecision) ? 0 : -1;  // only input 0 may be in-place, and only when its precision equals the output's
             dataConfig.constant = false;
-            auto inputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(
-                    getCnnLayer()->insData[i].lock()->getPrecision());
-            dataConfig.desc = MKLDNNMemoryDesc(getParentEdgeAt(i)->getDims(), inputDT, format);
-            config.inConfs.push_back(dataConfig);
-        }
 
-        auto outputDT = memory::f32;
-        auto lastFusedLayer = fusedWith[fusedWith.size() - 1].get()->getCnnLayer();
-        if (lastFusedLayer) {
-            outputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(lastFusedLayer->outData[0]->getPrecision());
+
+            dataConfig.desc = createMemoryDesc(getParentEdgeAt(i), inputPrecisions[i], offset);
+
+            config.inConfs.push_back(dataConfig);
         }
 
         InferenceEngine::DataConfig dataConfig;
         dataConfig.inPlace = -1;
         dataConfig.constant = false;
-        dataConfig.desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDT, format);
-        config.outConfs.push_back(dataConfig);
 
-        supportedPrimitiveDescriptors.push_back({config, impl_type, format});
+        dataConfig.desc = createMemoryDesc(getChildEdgeAt(0), outputPrecision, offset);
 
-        jep.src0_step = config.inConfs[0].desc.getDims()[1] == 1 ? 0 : 1;
-        jep.src1_step = config.inConfs[1].desc.getDims()[1] == 1 ? 0 : 1;
-        jep.dst_step = 1;
-        jep.src0_dt = MKLDNNExtensionUtils::IEPrecisionToDataType(config.inConfs[0].desc.getPrecision());
-        jep.src1_dt = MKLDNNExtensionUtils::IEPrecisionToDataType(config.inConfs[1].desc.getPrecision());
-        jep.dst_dt = MKLDNNExtensionUtils::IEPrecisionToDataType(config.outConfs[0].desc.getPrecision());
-        jep.src0_data_size = MKLDNNExtensionUtils::sizeOfDataType(jep.src0_dt);
-        jep.src1_data_size = MKLDNNExtensionUtils::sizeOfDataType(jep.src1_dt);
-        jep.dst_data_size = MKLDNNExtensionUtils::sizeOfDataType(jep.dst_dt);
-        jep.eltwise_op = op;
+        config.outConfs.push_back(dataConfig);
 
+        impl_desc_type impl_type;  // assigned below from the widest ISA reported by mayiuse, else ref
         if (mayiuse(cpu::avx512_common)) {
-            eltiwse_fq_kernel.reset(new jit_uni_eltwise_fq_generic<cpu::avx512_common>(jep, *attr.get()));
+            impl_type = impl_desc_type::jit_avx512;
         } else if (mayiuse(cpu::avx2)) {
-            eltiwse_fq_kernel.reset(new jit_uni_eltwise_fq_generic<cpu::avx2>(jep, *attr.get()));
+            impl_type = impl_desc_type::jit_avx2;
         } else if (mayiuse(cpu::sse42)) {
-            eltiwse_fq_kernel.reset(new jit_uni_eltwise_fq_generic<cpu::sse42>(jep, *attr.get()));
+            impl_type = impl_desc_type::jit_sse42;
+        } else {
+            impl_type = impl_desc_type::ref;
         }
+
+        return {config, impl_type, MKLDNNMemoryDesc(config.outConfs[0].desc).getFormat()};
+    };
+
+    bool isChannelsFirstApplicable = one_of(getChildEdgeAt(0)->getDims().ndims(), 1, 2, 4, 5);  // channels-first needs rank 1, 2, 4 or 5 on the output and every input, with equal ranks
+    for (size_t i = 0; i < getParentEdges().size(); i++) {
+        isChannelsFirstApplicable = isChannelsFirstApplicable && one_of(getParentEdgeAt(i)->getDims().ndims(), 1, 2, 4, 5);
+        isChannelsFirstApplicable = isChannelsFirstApplicable && getChildEdgeAt(0)->getDims().ndims() == getParentEdgeAt(i)->getDims().ndims();
+    }
+
+    bool isBlockedApplicable = one_of(getChildEdgeAt(0)->getDims().ndims(), 4, 5);  // blocked needs rank 4 or 5 on the output and every input, with equal ranks
+    for (size_t i = 0; i < getParentEdges().size(); i++) {
+        isBlockedApplicable = isBlockedApplicable && one_of(getParentEdgeAt(i)->getDims().ndims(), 4, 5);
+        isBlockedApplicable = isBlockedApplicable && getChildEdgeAt(0)->getDims().ndims() == getParentEdgeAt(i)->getDims().ndims();
     }
+
+    if (isChannelsFirstApplicable)  // registration order: channels-first, then blocked, then planar (always added)
+        supportedPrimitiveDescriptors.emplace_back(initDesc(ChannelsFirst));
+    if (isBlockedApplicable)
+        supportedPrimitiveDescriptors.emplace_back(initDesc(Blocked));
+    supportedPrimitiveDescriptors.emplace_back(initDesc(Planar));
 }
 
 void MKLDNNEltwiseNode::createPrimitive() {
-    if (prim)
-        return;
+    auto config = getSelectedPrimitiveDescriptor()->getConfig();
 
-    auto& dstMemPtr = getChildEdgeAt(0)->getMemoryPtr();
-    if (!dstMemPtr || !dstMemPtr->GetPrimitivePtr())
-        THROW_IE_EXCEPTION << "Destination memory didn't allocate.";
-    if (getSelectedPrimitiveDescriptor() == nullptr)
-        THROW_IE_EXCEPTION << "Preferable primitive descriptor is not set.";
+    // Fills dims_out and dims_in[i] from the blocked dims of the selected config. Every vector is
+    // resized to maxInputSize (padding with 1) and the blocked dims are copied right-aligned.
+    // Throws if a descriptor has more blocked dims than fit into maxInputSize, or if some input
+    // extent is neither 1 nor equal to the output extent at the same position.
+    auto initDims = [this, config](size_t maxInputSize) {
+        const size_t inputNum = getParentEdges().size();
 
-    std::vector<memory::primitive_desc> srcs_pd;
-    std::vector<primitive::at> srcs_p;
-    for (size_t i = 0; i < getParentEdges().size(); i++) {
-        auto& srcMemPtr = getParentEdgeAt(i)->getMemoryPtr();
-        if (!srcMemPtr || !srcMemPtr->GetPrimitivePtr()) {
-            auto parent = getParentEdgeAt(i)->getParent();
-            THROW_IE_EXCEPTION << "Source memory from " << parent->getName() << " didn't allocate.";
+        dims_in.resize(inputNum);
+        for (size_t i = 0; i < inputNum; i++) {
+            dims_in[i].resize(maxInputSize, 1);
         }
 
-        if (op == EltwiseLayer::Sum) {
-            srcs_pd.push_back(srcMemPtr->GetPrimitiveDescriptor());
-            srcs_p.emplace_back(srcMemPtr->GetPrimitive());
+        dims_out.resize(maxInputSize, 1);
+
+        const auto outOrder = config.outConfs[0].desc.getBlockingDesc().getOrder();
+        const auto outBlockDims = config.outConfs[0].desc.getBlockingDesc().getBlockDims();
+        const size_t outRank = outBlockDims.size();
+        // The right-aligned copy below would index before the start of dims_out otherwise.
+        if (outRank > dims_out.size()) {
+            THROW_IE_EXCEPTION << "Eltwise node with name `" << getName() << "` has invalid input/output dims configuration.";
         }
-    }
-    if (op == EltwiseLayer::Sum && !broadcast && fusedWith.empty()) {
-        try {
-            auto primitive_desc = mkldnn::sum::primitive_desc(dstMemPtr->GetDescriptor(), sum_scales, srcs_pd);
-            prim = std::shared_ptr<mkldnn::sum>(new mkldnn::sum(primitive_desc, srcs_p, dstMemPtr->GetPrimitive()));
-        } catch (...) {
-            std::cerr << "Handle this problem correctly!" << std::endl;
-            prim = nullptr;
+
+        for (size_t i = 0; i < outRank; i++) {
+            dims_out[dims_out.size() - 1 - i] = outBlockDims[outRank - 1 - i];
         }
-    }
-}
 
-void MKLDNNEltwiseNode::initOptimalPrimitiveDescriptor() {
-    auto selected_pd = getSelectedPrimitiveDescriptor();
-    if (selected_pd == nullptr)
-        THROW_IE_EXCEPTION << "Preferable primitive descriptor is not set.";
-    auto config = selected_pd->getConfig();
-    if (isInitConfig(config))
-        return;
+        for (size_t i = 0; i < inputNum; i++) {
+            const auto inBlockDims = config.inConfs[i].desc.getBlockingDesc().getBlockDims();
+            const size_t inRank = inBlockDims.size();
 
-    MKLDNNNode::initOptimalPrimitiveDescriptor();
+            // WA to normalize blocked and planar layouts: shift the input by one position when the
+            // output order length differs from the output dims count and the last order entries
+            // of output and input differ.
+            const auto inOrder = config.inConfs[i].desc.getBlockingDesc().getOrder();
+            const size_t startOff = outOrder.size() != config.outConfs[0].desc.getDims().size() &&
+                                    outOrder[outOrder.size() - 1] != inOrder[inOrder.size() - 1] ? 1 : 0;
 
-    auto* selectedPD = getSelectedPrimitiveDescriptor();
-    if (!selectedPD) {
-        return;
-    }
+            // Same underflow guard as for the output, taking the extra shift into account.
+            if (inRank + startOff > dims_in[i].size())
+                THROW_IE_EXCEPTION << "Eltwise node with name `" << getName() << "` has invalid input/output dims configuration.";
+
+            for (size_t j = 0; j < inRank; j++) {
+                dims_in[i][dims_in[i].size() - 1 - j - startOff] = inBlockDims[inRank - 1 - j];
+            }
+        }
 
-    auto& selectedConfig = getSelectedPrimitiveDescriptor()->getConfig();
-    for (size_t i = 1; i < selectedConfig.inConfs.size(); i++) {
-        if (selectedConfig.inConfs[0].desc.getPrecision() != selectedConfig.inConfs[i].desc.getPrecision()) {
-            selectedConfig.inConfs[i].desc.setPrecision(selectedConfig.inConfs[0].desc.getPrecision());
+        // Each input extent must either match the output extent or be 1.
+        for (size_t i = 0; i < dims_in.size(); i++) {
+            for (size_t j = 0; j < dims_in[i].size(); j++) {
+                if (dims_in[i][j] != dims_out[j] && dims_in[i][j] != 1)
+                    THROW_IE_EXCEPTION << "Eltwise node with name `" << getName() << "` has invalid input/output dims configuration.";
+            }
         }
-    }
-}
+    };
 
-void MKLDNNEltwiseNode::setPostOps(mkldnn::primitive_attr &attr, bool initWeights) {
-    mkldnn::post_ops ops;
+    // Fills offsets_out / offsets_in[i] via offset_out_calc / offset_in_calc (defined elsewhere)
+    // and scales every entry by the element size of the corresponding precision. Also sets
+    // start_offset_in[i] / start_offset_out to the edge memory's offset_padding multiplied by
+    // the size of its data type.
+    auto initOffsets = [this, config](size_t maxInputSize) {
+        const size_t numInputs = getParentEdges().size();
 
-    for (auto &node : fusedWith) {
-        auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get());
-        if (activationNode) {
-            ops.append_eltwise(1.0, activationNode->getAlgorithm(), activationNode->getAlpha(), activationNode->getBeta());
+        offsets_out.resize(maxInputSize, 1);
+        offset_out_calc(offsets_out, dims_out);
+        const auto outElemSize = config.outConfs[0].desc.getPrecision().size();
+        for (size_t d = 0; d < maxInputSize; d++) {
+            offsets_out[d] *= outElemSize;
+        }
 
-            continue;
+        offsets_in.resize(numInputs);
+        for (size_t idx = 0; idx < numInputs; idx++) {
+            auto& inOffsets = offsets_in[idx];
+            inOffsets.resize(maxInputSize, 1);
+            offset_in_calc(inOffsets, dims_in[idx], dims_out);
+            const auto inElemSize = config.inConfs[idx].desc.getPrecision().size();
+            for (size_t d = 0; d < maxInputSize; d++) {
+                inOffsets[d] *= inElemSize;
+            }
         }
 
-        auto* quantizeNode = dynamic_cast<MKLDNNQuantizeNode *>(node.get());
-        if (quantizeNode) {
-            quantizeNode->appendPostOps(ops);
-            continue;
+        start_offset_in.resize(numInputs);
+        for (size_t idx = 0; idx < numInputs; idx++) {
+            const auto inMemDesc = getParentEdgeAt(idx)->getMemory().GetDescriptor();
+            start_offset_in[idx] = inMemDesc.data.layout_desc.blocking.offset_padding *
+                                   MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(inMemDesc.data.data_type));
         }
+        const auto outMemDesc = getChildEdgeAt(0)->getMemory().GetDescriptor();
+        start_offset_out = outMemDesc.data.layout_desc.blocking.offset_padding *
+                           MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(outMemDesc.data.data_type));
+    };
 
-        THROW_IE_EXCEPTION << "Fusing of " << NameFromType(node->getType()) << " operation to " << NameFromType(this->getType()) << " node is not implemented";
-    }
+    // Folds the `dimsToCollapse` dimensions preceding the last one into the last dimension,
+    // shifts the remaining leading dimensions right by the same amount and fills the vacated
+    // leading slots with 1. The vector keeps its length.
+    // `dimsToCollapse` is clamped to [0, dims.size() - 1]; vectors with fewer than two
+    // elements are left untouched.
+    auto collapseLastDims = [](std::vector<size_t>& dims, int dimsToCollapse) {
+        // Use signed arithmetic: `dims.size() - dimsToCollapse - 2` evaluated as size_t wraps
+        // around once dimsToCollapse >= dims.size() - 1, which skipped the accumulation loop
+        // while the leading dims were still overwritten with 1.
+        const int rank = static_cast<int>(dims.size());
+        const int collapse = std::max(0, std::min(dimsToCollapse, rank - 1));
+
+        for (int i = rank - 2; i > rank - collapse - 2; i--) {
+            dims[rank - 1] *= dims[i];
         }
 
-    attr.set_post_ops(ops);
-}
+        for (int i = rank - 2; i >= collapse; i--) {
+            dims[i] = dims[i - collapse];
+        }
 
-void MKLDNNEltwiseNode::dims_calc(int *dims, const MKLDNNDims &edge_dims, bool channels_first = false) {
-    for (int i = 0; i < 5; i++)
-        dims[i] = 1;
-    int ndims = edge_dims.ndims();
-    if (ndims > 5) {
-        THROW_IE_EXCEPTION << "ndims should be less then 5";
-    }
-    for (int i = 0; i < ndims; i++) {
-        dims[4 - i] = edge_dims[ndims - 1 - i];
-    }
-    if (edge_dims.ndims() && !(broadcast && edge_dims[0] == getChildEdgeAt(0)->getDims()[0]))
-        dims[batch_dim] = std::min(dims[batch_dim], batchToProcess());
-
-    if (channels_first) {
-        auto ch_idx = 5 - ndims + 1;
-        auto ch = dims[ch_idx];
-        for (int i = ch_idx; i < 4; i++) {
-            dims[i] = dims[i + 1];
+        for (int i = collapse - 1; i >= 0; i--) {
+            dims[i] = 1;
         }
-        dims[4] = ch;
-    }
-}
+    };
 
-void MKLDNNEltwiseNode::offset_out_calc(int *offset, int *dims) {
-    int k = 1;
-    for (int i = 4; i >= 0; i--) {
-        offset[i] = k;
-        k *= dims[i];
-    }
-}
+    // Offset counterpart of collapsing trailing dimensions: merges the `dimsToCollapse` entries
+    // preceding the last one into the last entry, shifts the remaining leading entries right by
+    // the same amount and fills the vacated leading slots with 0. The vector keeps its length.
+    // While merging, a zero entry is treated as 1 as long as at least one of the two operands is
+    // non-zero; if both are zero the result stays zero.
+    // `dimsToCollapse` is clamped to [0, dims.size() - 1]; vectors with fewer than two
+    // elements are left untouched.
+    auto collapseLastOffsets = [](std::vector<size_t>& dims, int dimsToCollapse) {
+        // Use signed arithmetic: `dims.size() - dimsToCollapse - 2` evaluated as size_t wraps
+        // around once dimsToCollapse >= dims.size() - 1, which skipped the merge loop while
+        // the leading entries were still overwritten with 0.
+        const int rank = static_cast<int>(dims.size());
+        const int collapse = std::max(0, std::min(dimsToCollapse, rank - 1));
+
+        for (int i = rank - 2; i > rank - collapse - 2; i--) {
+            if (dims[rank - 1] > 0 || dims[i] > 0)
+                dims[rank - 1] = std::max(dims[rank - 1], static_cast<size_t>(1)) * std::max(dims[i], static_cast<size_t>(1));
+            else
+                dims[rank - 1] *= dims[i];
+        }
 
-void MKLDNNEltwiseNode::offset_in_calc(int *offset, int *dims_in, int *dims_out) {
-    int k = 1;
-    for (int i = 4; i >= 0; i--) {
-        offset[i] = (dims_in[i] == dims_out[i]) ? k : 0;
-        k *= dims_in[i];
-    }
-}
+        for (int i = rank - 2; i >= collapse; i--) {
+            dims[i] = dims[i - collapse];
+        }
 
-// Intel C++ Compiler 18.0 for Windows contains bug that doesn't allow to use templates to generate eltwise implementations
-// and to avoid all copypaste below
-template <typename T0, typename T1> void MKLDNNEltwiseNode::eltwise_add(
-        const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) {
-    if (!broadcast) {
-#ifdef _WIN32
-        for (size_t i = 0; i < dst_data_size; i++) {
-            dst_ptr[i] = src0_ptr[i] + src1_ptr[i];
-    }
-#else
-        parallel_for(dst_data_size, [&](size_t i) {
-            dst_ptr[i] = src0_ptr[i] + src1_ptr[i];
-        });
-#endif
-        for (int j = 2; j < getParentEdges().size(); j++) {
-            const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
-                                getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
-            for (size_t i = 0; i < dst_data_size; i++) {
-                dst_ptr[i] = dst_ptr[i] + src_ptr[i];
-            }
-#else
-            parallel_for(dst_data_size, [&](size_t i) {
-                dst_ptr[i] = dst_ptr[i] + src_ptr[i];
-            });
-#endif
-        }
-    } else {
-        int dims_out[5], dims_in0[5], dims_in1[5];
-        int offset_out[5], offset_in0[5], offset_in1[5];
-        auto& child_edge_dims = getChildEdgeAt(0)->getDims();
-        auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
-        auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
-        dims_calc(dims_out, child_edge_dims);
-        dims_calc(dims_in0, parent0_edge_dims);
-        dims_calc(dims_in1, parent1_edge_dims);
-        offset_out_calc(offset_out, dims_out);
-        offset_in_calc(offset_in0, dims_in0, dims_out);
-        offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-        for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-        for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-            for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                    for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                        size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                        size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                        size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                        dst_ptr[index_out] = src0_ptr[index_in0] + src1_ptr[index_in1];
-                    }
-                }
-            }
-        }
-    }
-#else
-        parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-            for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                dst_ptr[index_out] = src0_ptr[index_in0] + src1_ptr[index_in1];
-            }
-        });
-#endif
-        for (size_t n = 2; n < getParentEdges().size(); n++) {
-            const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
-                                getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
-            auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
-            dims_calc(dims_in1, parent_edge_dims);
-            offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-            for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-            for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-                for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                    for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                        for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                            size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                            size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                            dst_ptr[index_out] = dst_ptr[index_out] + src_ptr[index_in];
-                        }
-                    }
-                }
-            }
-        }
-#else
-            parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-                for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                    size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                    size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                    dst_ptr[index_out] = dst_ptr[index_out] + src_ptr[index_in];
-                }
-            });
-#endif
+        for (int i = collapse - 1; i >= 0; i--) {
+            dims[i] = 0;
         }
-    }
-}
+    };
 
-template <typename T0, typename T1> void MKLDNNEltwiseNode::eltwise_prod(
-        const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) {
-    if (!broadcast) {
-#ifdef _WIN32
-        for (size_t i = 0; i < dst_data_size; i++) {
-            dst_ptr[i] = src0_ptr[i] * src1_ptr[i];
-    }
-#else
-        parallel_for(dst_data_size, [&](size_t i) {
-            dst_ptr[i] = src0_ptr[i] * src1_ptr[i];
-        });
-#endif
-        for (int j = 2; j < getParentEdges().size(); j++) {
-            const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
-                                getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
-            for (size_t i = 0; i < dst_data_size; i++) {
-                dst_ptr[i] = dst_ptr[i] * src_ptr[i];
-            }
-#else
-            parallel_for(dst_data_size, [&](size_t i) {
-                dst_ptr[i] = dst_ptr[i] * src_ptr[i];
-            });
-#endif
-        }
-    } else {
-        int dims_out[5], dims_in0[5], dims_in1[5];
-        int offset_out[5], offset_in0[5], offset_in1[5];
-        auto& child_edge_dims = getChildEdgeAt(0)->getDims();
-        auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
-        auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
-        dims_calc(dims_out, child_edge_dims);
-        dims_calc(dims_in0, parent0_edge_dims);
-        dims_calc(dims_in1, parent1_edge_dims);
-        offset_out_calc(offset_out, dims_out);
-        offset_in_calc(offset_in0, dims_in0, dims_out);
-        offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-        for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-        for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-            for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                    for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                        size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                        size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                        size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                        dst_ptr[index_out] = src0_ptr[index_in0] * src1_ptr[index_in1];
-                    }
-                }
-            }
-        }
-    }
-#else
-        parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-            for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                dst_ptr[index_out] = src0_ptr[index_in0] * src1_ptr[index_in1];
-            }
-        });
-#endif
-        for (size_t n = 2; n < getParentEdges().size(); n++) {
-            const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
-                                getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
-            auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
-            dims_calc(dims_in1, parent_edge_dims);
-            offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-            for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-            for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-                for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                    for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                        for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                            size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                            size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                            dst_ptr[index_out] = dst_ptr[index_out] * src_ptr[index_in];
-                        }
-                    }
-                }
-            }
-        }
-#else
-            parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) {
-                size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                dst_ptr[index_out] = dst_ptr[index_out] * src_ptr[index_in];
-            });
-#endif
-        }
-    }
-}
+    tensorRank = std::max(static_cast<size_t>(optimalTensorRank), config.outConfs[0].desc.getBlockingDesc().getBlockDims().size());
+    initDims(tensorRank);
 
-template <typename T0, typename T1> void MKLDNNEltwiseNode::eltwise_max(
-        const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) {
-    if (!broadcast) {
-#ifdef _WIN32
-        for (size_t i = 0; i < dst_data_size; i++) {
-            dst_ptr[i] = std::max(src0_ptr[i], (T0)src1_ptr[i]);
-    }
-#else
-        parallel_for(dst_data_size, [&](size_t i) {
-            dst_ptr[i] = std::max(src0_ptr[i], (T0)src1_ptr[i]);
-        });
-#endif
-        for (int j = 2; j < getParentEdges().size(); j++) {
-            const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
-                                getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
-            for (size_t i = 0; i < dst_data_size; i++) {
-                dst_ptr[i] = std::max(dst_ptr[i], (T0)src_ptr[i]);
-            }
-#else
-            parallel_for(dst_data_size, [&](size_t i) {
-                dst_ptr[i] = std::max(dst_ptr[i], (T0)src_ptr[i]);
-            });
-#endif
-        }
-    } else {
-        int dims_out[5], dims_in0[5], dims_in1[5];
-        int offset_out[5], offset_in0[5], offset_in1[5];
-        auto& child_edge_dims = getChildEdgeAt(0)->getDims();
-        auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
-        auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
-        dims_calc(dims_out, child_edge_dims);
-        dims_calc(dims_in0, parent0_edge_dims);
-        dims_calc(dims_in1, parent1_edge_dims);
-        offset_out_calc(offset_out, dims_out);
-        offset_in_calc(offset_in0, dims_in0, dims_out);
-        offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-        for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-        for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-            for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                    for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                        size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                        size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                        size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                        dst_ptr[index_out] = std::max(src0_ptr[index_in0], (T0)src1_ptr[index_in1]);
-                    }
-                }
+    auto outOrder = config.outConfs[0].desc.getBlockingDesc().getOrder();
+    size_t oc_size = 0;
+    offsets_oc.resize(tensorRank, 0);
+    if (isFusedWith(Quantize)) {
+        size_t offset_oc = 1;
+        for (int i = outOrder.size() - 1; i >= 0; i--) {
+            if (outOrder[i] == 1) {
+                int oc_dim_idx = i + (tensorRank - outOrder.size());
+                offsets_oc[oc_dim_idx] = offset_oc;
+                offset_oc *= dims_out[oc_dim_idx];
             }
         }
+        oc_size = offsets_oc[dims_out.size() - 1] != 0 ? dims_out[dims_out.size() - 1] : 1;
     }
-#else
-        parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-            for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                dst_ptr[index_out] = std::max(src0_ptr[index_in0], (T0)src1_ptr[index_in1]);
-            }
-        });
-#endif
-        for (size_t n = 2; n < getParentEdges().size(); n++) {
-            const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
-                                getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
-            auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
-            dims_calc(dims_in1, parent_edge_dims);
-            offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-            for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-            for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-                for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                    for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                        for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                            size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                            size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                            dst_ptr[index_out] = std::max(dst_ptr[index_out], (T0)src_ptr[index_in]);
-                        }
-                    }
-                }
-            }
-        }
-#else
-            parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-                for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                    size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                    size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                    dst_ptr[index_out] = std::max(dst_ptr[index_out], (T0)src_ptr[index_in]);
-                }
-            });
-#endif
-        }
-    }
-}
 
-template <typename T0, typename T1> void MKLDNNEltwiseNode::eltwise_sub(
-        const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) {
-    if (!broadcast) {
-#ifdef _WIN32
-        for (size_t i = 0; i < dst_data_size; i++) {
-            dst_ptr[i] = src0_ptr[i] - src1_ptr[i];
-    }
-#else
-        parallel_for(dst_data_size, [&](size_t i) {
-            dst_ptr[i] = src0_ptr[i] - src1_ptr[i];
-        });
-#endif
-        for (int j = 2; j < getParentEdges().size(); j++) {
-            const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
-                                getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
-            for (size_t i = 0; i < dst_data_size; i++) {
-                dst_ptr[i] = dst_ptr[i] - src_ptr[i];
-            }
-#else
-            parallel_for(dst_data_size, [&](size_t i) {
-                dst_ptr[i] = dst_ptr[i] - src_ptr[i];
-            });
-#endif
-        }
-    } else {
-        int dims_out[5], dims_in0[5], dims_in1[5];
-        int offset_out[5], offset_in0[5], offset_in1[5];
-        auto& child_edge_dims = getChildEdgeAt(0)->getDims();
-        auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
-        auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
-        dims_calc(dims_out, child_edge_dims);
-        dims_calc(dims_in0, parent0_edge_dims);
-        dims_calc(dims_in1, parent1_edge_dims);
-        offset_out_calc(offset_out, dims_out);
-        offset_in_calc(offset_in0, dims_in0, dims_out);
-        offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-        for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-        for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-            for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                    for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                        size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                        size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                        size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                        dst_ptr[index_out] = src0_ptr[index_in0] - src1_ptr[index_in1];
-                    }
-                }
-            }
-        }
+    fullWorkAmount = 1;
+    for (int i = 0; i < dims_out.size(); i++) {
+        fullWorkAmount *= dims_out[i];
     }
-#else
-        parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-            for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                dst_ptr[index_out] = src0_ptr[index_in0] - src1_ptr[index_in1];
-            }
-        });
-#endif
-        for (size_t n = 2; n < getParentEdges().size(); n++) {
-            const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
-                                getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
-            auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
-            dims_calc(dims_in1, parent_edge_dims);
-            offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-            for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-            for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-                for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                    for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                        for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                            size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                            size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                            dst_ptr[index_out] = dst_ptr[index_out] - src_ptr[index_in];
-                        }
-                    }
+
+    size_t minimalConcurrency = parallel_get_max_threads();
+    size_t minimalJitWorkAmount = 256;
+    size_t currentJitWorkAmount = dims_out[dims_out.size() - 1];
+    int collapsedDims = 0;
+    if (canUseOptimizedImpl) {
+        bool hasDifferentDims = false;
+        while (currentJitWorkAmount < minimalJitWorkAmount) {
+            if (dims_out.size() - collapsedDims - 2 < 0)
+                break;
+
+            for (int j = 1; j < dims_in.size(); j++) {
+                if (dims_in[j][dims_in[j].size() - 1] != dims_in[0][dims_in[0].size() - 1]) {
+                    hasDifferentDims = true;
                 }
             }
-        }
-#else
-            parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-                for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                    size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                    size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                    dst_ptr[index_out] = dst_ptr[index_out] - src_ptr[index_in];
-                }
-            });
-#endif
-        }
-    }
-}
 
-template <typename T0, typename T1> void MKLDNNEltwiseNode::eltwise_min(
-        const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) {
-    if (!broadcast) {
-#ifdef _WIN32
-        for (size_t i = 0; i < dst_data_size; i++) {
-            dst_ptr[i] = std::min(src0_ptr[i], (T0)src1_ptr[i]);
-    }
-#else
-        parallel_for(dst_data_size, [&](size_t i) {
-            dst_ptr[i] = std::min(src0_ptr[i], (T0)src1_ptr[i]);
-        });
-#endif
-        for (int j = 2; j < getParentEdges().size(); j++) {
-            const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
-                                getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
-            for (size_t i = 0; i < dst_data_size; i++) {
-                dst_ptr[i] = std::min(dst_ptr[i], (T0)src_ptr[i]);
+            if (oc_size > 1 && oc_size != dims_in[0][dims_in[0].size() - 1]) {
+                hasDifferentDims = true;
             }
-#else
-            parallel_for(dst_data_size, [&](size_t i) {
-                dst_ptr[i] = std::min(dst_ptr[i], (T0)src_ptr[i]);
-            });
-#endif
-        }
-    } else {
-        int dims_out[5], dims_in0[5], dims_in1[5];
-        int offset_out[5], offset_in0[5], offset_in1[5];
-        auto& child_edge_dims = getChildEdgeAt(0)->getDims();
-        auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
-        auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
-        dims_calc(dims_out, child_edge_dims);
-        dims_calc(dims_in0, parent0_edge_dims);
-        dims_calc(dims_in1, parent1_edge_dims);
-        offset_out_calc(offset_out, dims_out);
-        offset_in_calc(offset_in0, dims_in0, dims_out);
-        offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-        for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-        for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-            for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                    for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                        size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                        size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                        size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                        dst_ptr[index_out] = std::min(src0_ptr[index_in0], (T0)src1_ptr[index_in1]);
+
+            bool canCollapse = true;
+            for (int i = 0; i < dims_in.size(); i++) {
+                if (dims_in[i][dims_in[i].size() - 2] != 1) {
+                    if (dims_in[i][dims_in[i].size() - 1] == 1) {
+                        canCollapse = false;
+                        break;
                     }
-                }
-            }
-        }
-    }
-#else
-        parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-            for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                dst_ptr[index_out] = std::min(src0_ptr[index_in0], (T0)src1_ptr[index_in1]);
-            }
-        });
-#endif
-        for (size_t n = 2; n < getParentEdges().size(); n++) {
-            const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
-                                getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
-            auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
-            dims_calc(dims_in1, parent_edge_dims);
-            offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-            for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-            for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-                for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                    for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                        for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                            size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                            size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                            dst_ptr[index_out] = std::min(dst_ptr[index_out], (T0)src_ptr[index_in]);
-                        }
+
+                    if (hasDifferentDims) {
+                        canCollapse = false;
+                        break;
                     }
                 }
             }
-        }
-#else
-            parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-                for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                    size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                    size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                    dst_ptr[index_out] = std::min(dst_ptr[index_out], (T0)src_ptr[index_in]);
-                }
-            });
-#endif
-        }
-    }
-}
 
-template <typename T0, typename T1> void MKLDNNEltwiseNode::eltwise_div(
-        const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) {
-    if (!broadcast) {
-#ifdef _WIN32
-        for (size_t i = 0; i < dst_data_size; i++) {
-            dst_ptr[i] = src0_ptr[i] / src1_ptr[i];
-    }
-#else
-        parallel_for(dst_data_size, [&](size_t i) {
-            dst_ptr[i] = src0_ptr[i] / src1_ptr[i];
-        });
-#endif
-        for (int j = 2; j < getParentEdges().size(); j++) {
-            const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
-                                getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
-            for (size_t i = 0; i < dst_data_size; i++) {
-                dst_ptr[i] = dst_ptr[i] / src_ptr[i];
+            if (!canCollapse) {
+                break;
             }
-#else
-            parallel_for(dst_data_size, [&](size_t i) {
-                dst_ptr[i] = dst_ptr[i] / src_ptr[i];
-            });
-#endif
-        }
-    } else {
-        int dims_out[5], dims_in0[5], dims_in1[5];
-        int offset_out[5], offset_in0[5], offset_in1[5];
-        auto& child_edge_dims = getChildEdgeAt(0)->getDims();
-        auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
-        auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
-        dims_calc(dims_out, child_edge_dims);
-        dims_calc(dims_in0, parent0_edge_dims);
-        dims_calc(dims_in1, parent1_edge_dims);
-        offset_out_calc(offset_out, dims_out);
-        offset_in_calc(offset_in0, dims_in0, dims_out);
-        offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-        for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-        for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-            for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                    for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                        size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                        size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                        size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                        dst_ptr[index_out] = src0_ptr[index_in0] / src1_ptr[index_in1];
-                    }
+
+            size_t nextJitWorkAmount = currentJitWorkAmount * dims_out[dims_out.size() - 2];
+            if (fullWorkAmount / nextJitWorkAmount >= minimalConcurrency) {
+                currentJitWorkAmount = nextJitWorkAmount;
+                collapsedDims++;
+
+                for (int i = 0; i < dims_in.size(); i++) {
+                    collapseLastDims(dims_in[i], 1);
                 }
-            }
-        }
-    }
-#else
-        parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-            for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                dst_ptr[index_out] = src0_ptr[index_in0] / src1_ptr[index_in1];
-            }
-        });
-#endif
-        for (size_t n = 2; n < getParentEdges().size(); n++) {
-            const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
-                                getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
-            auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
-            dims_calc(dims_in1, parent_edge_dims);
-            offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-            for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-            for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-                for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                    for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                        for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                            size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                            size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                            dst_ptr[index_out] = dst_ptr[index_out] / src_ptr[index_in];
-                        }
-                    }
+                collapseLastDims(dims_out, 1);
+
+                if (isFusedWith(Quantize)) {
+                    collapseLastOffsets(offsets_oc, 1);
                 }
+            } else {
+                break;
             }
         }
-#else
-            parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-                for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                    size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                    size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                    dst_ptr[index_out] = dst_ptr[index_out] / src_ptr[index_in];
-                }
-            });
-#endif
-        }
     }
-}
 
-template <typename T0, typename T1> void MKLDNNEltwiseNode::eltwise_squared_diff(
-        const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) {
-    if (!broadcast) {
-#ifdef _WIN32
-        for (size_t i = 0; i < dst_data_size; i++) {
-            dst_ptr[i] = (src0_ptr[i] - src1_ptr[i]) * (src0_ptr[i] - src1_ptr[i]);
+    isDynBatchEnabled = config.dynBatchSupport;
+    batchDimIdx = tensorRank - config.outConfs[0].desc.getBlockingDesc().getBlockDims().size() + collapsedDims;
+    schedulerWorkAmount = fullWorkAmount / dims_out[dims_out.size() - 1];
+
+    initOffsets(tensorRank);
+
+    jep.inputs_number = config.inConfs.size();
+    jep.input_size = tensorRank;
+
+    for (int i = 0; i < config.inConfs.size(); i++) {
+        jep.src_size[i] = dims_in[i][dims_in[i].size() - 1];
+        jep.src_prc[i] = config.inConfs[i].desc.getPrecision();
     }
-#else
-        parallel_for(dst_data_size, [&](size_t i) {
-            dst_ptr[i] = (src0_ptr[i] - src1_ptr[i]) * (src0_ptr[i] - src1_ptr[i]);
-        });
-#endif
-        for (int j = 2; j < getParentEdges().size(); j++) {
-            const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
-                                getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
-            for (size_t i = 0; i < dst_data_size; i++) {
-                dst_ptr[i] = (dst_ptr[i] - src_ptr[i]) * (dst_ptr[i] - src_ptr[i]);
-            }
-#else
-            parallel_for(dst_data_size, [&](size_t i) {
-                dst_ptr[i] = (dst_ptr[i] - src_ptr[i]) * (dst_ptr[i] - src_ptr[i]);
-            });
-#endif
-        }
-    } else {
-        int dims_out[5], dims_in0[5], dims_in1[5];
-        int offset_out[5], offset_in0[5], offset_in1[5];
-        auto& child_edge_dims = getChildEdgeAt(0)->getDims();
-        auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
-        auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
-        dims_calc(dims_out, child_edge_dims);
-        dims_calc(dims_in0, parent0_edge_dims);
-        dims_calc(dims_in1, parent1_edge_dims);
-        offset_out_calc(offset_out, dims_out);
-        offset_in_calc(offset_in0, dims_in0, dims_out);
-        offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-        for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-        for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-            for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                    for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                        size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                        size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                        size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                        dst_ptr[index_out] = (src0_ptr[index_in0] - src1_ptr[index_in1]) * (src0_ptr[index_in0] - src1_ptr[index_in1]);
-                    }
-                }
-            }
-        }
+    jep.dst_size = dims_out[dims_out.size() - 1];
+    jep.dst_prc = config.outConfs[0].desc.getPrecision();
+
+    for (int i = 0; i < config.inConfs.size(); i++) {
+        jep.src_offsets[i] = offsets_in[i];
     }
-#else
-        parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-            for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                dst_ptr[index_out] = (src0_ptr[index_in0] - src1_ptr[index_in1]) * (src0_ptr[index_in0] - src1_ptr[index_in1]);
-            }
-        });
-#endif
-        for (size_t n = 2; n < getParentEdges().size(); n++) {
-            const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
-                                getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
-            auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
-            dims_calc(dims_in1, parent_edge_dims);
-            offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-            for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-            for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-                for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                    for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                        for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                            size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                            size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                            dst_ptr[index_out] = (dst_ptr[index_out] - src_ptr[index_in]) * (dst_ptr[index_out] - src_ptr[index_in]);
-                        }
-                    }
-                }
-            }
-        }
-#else
-            parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-                for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                    size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                    size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                    dst_ptr[index_out] = (dst_ptr[index_out] - src_ptr[index_in]) * (dst_ptr[index_out] - src_ptr[index_in]);
-                }
-            });
-#endif
-        }
+    jep.dst_offsets = offsets_out;
+
+    jep.oc_size = oc_size;
+
+    if (mayiuse(cpu::avx512_common)) {
+        eltwise_kernel.reset(new jit_uni_eltwise_generic<cpu::avx512_common>(jep, *this));
+    } else if (mayiuse(cpu::avx2)) {
+        eltwise_kernel.reset(new jit_uni_eltwise_generic<cpu::avx2>(jep, *this));
+    } else if (mayiuse(cpu::sse42)) {
+        eltwise_kernel.reset(new jit_uni_eltwise_generic<cpu::sse42>(jep, *this));
     }
 }
 
-template <typename T0, typename T1> void MKLDNNEltwiseNode::eltwise_floor_mod(
-        const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) {
-    if (!broadcast) {
-#ifdef _WIN32
-        for (size_t i = 0; i < dst_data_size; i++) {
-            dst_ptr[i] = src0_ptr[i] - src0_ptr[i] / src1_ptr[i] * src1_ptr[i];
-    }
-#else
-        parallel_for(dst_data_size, [&](size_t i) {
-            dst_ptr[i] = src0_ptr[i] - src0_ptr[i] / src1_ptr[i] * src1_ptr[i];
-        });
-#endif
-        for (int j = 2; j < getParentEdges().size(); j++) {
-            const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
-                                getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
-            for (size_t i = 0; i < dst_data_size; i++) {
-                dst_ptr[i] = dst_ptr[i] - dst_ptr[i] / src_ptr[i] * src_ptr[i];
-            }
-#else
-            parallel_for(dst_data_size, [&](size_t i) {
-                dst_ptr[i] = dst_ptr[i] - dst_ptr[i] / src_ptr[i] * src_ptr[i];
-            });
-#endif
-        }
-    } else {
-        int dims_out[5], dims_in0[5], dims_in1[5];
-        int offset_out[5], offset_in0[5], offset_in1[5];
-        auto& child_edge_dims = getChildEdgeAt(0)->getDims();
-        auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
-        auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
-        dims_calc(dims_out, child_edge_dims);
-        dims_calc(dims_in0, parent0_edge_dims);
-        dims_calc(dims_in1, parent1_edge_dims);
-        offset_out_calc(offset_out, dims_out);
-        offset_in_calc(offset_in0, dims_in0, dims_out);
-        offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-        for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-        for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-            for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                    for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                        size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                        size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                        size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                        dst_ptr[index_out] = src0_ptr[index_in0] - src0_ptr[index_in0] / src1_ptr[index_in1] * src1_ptr[index_in1];
+void MKLDNNEltwiseNode::selectOptimalPrimitiveDescriptor() {  // Picks, per priority type, the supported descriptor whose inputs best match the parents' outputs.
+    for (auto& type : getPrimitivesPriority()) {  // types are tried in the order returned by getPrimitivesPriority()
+        int selectedPrimitive = -1;  // index of the best descriptor of this type; -1 means none found
+        int equalsFormatCount = -1;  // best match count so far; starts at -1 so a descriptor with zero matches is still accepted
+        for (size_t i = 0; i < getSupportedPrimitiveDescriptors().size(); i++) {
+            impl_desc_type supportedType = getSupportedPrimitiveDescriptors()[i].getImplementationType();
+            if (type == supportedType) {
+                int equalsLocalFormatCount = 0;  // number of inputs of descriptor i counted as matching
+                if (getSupportedPrimitiveDescriptors()[i].getConfig().inConfs.size() > getParentEdges().size())
+                    continue;  // descriptor expects more inputs than this node has parent edges
+                for (size_t j = 0; j < getSupportedPrimitiveDescriptors()[i].getConfig().inConfs.size(); j++) {
+                    auto parentEdge = getParentEdgeAt(j);
+                    auto parentPtr = parentEdge->getParent();
+                    // Constant edges (except input 0) are not compared, since reorders on them are executed at the load-network stage.
+                    if (j > 0 && parentPtr->isConstant()) {
+                        equalsLocalFormatCount++;  // a constant input is counted as a match unconditionally
+                        continue;
                     }
-                }
-            }
-        }
-    }
-#else
-        parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-            for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                dst_ptr[index_out] = src0_ptr[index_in0] - src0_ptr[index_in0] / src1_ptr[index_in1] * src1_ptr[index_in1];
-            }
-        });
-#endif
-        for (size_t n = 2; n < getParentEdges().size(); n++) {
-            const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
-                                getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
-            auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
-            dims_calc(dims_in1, parent_edge_dims);
-            offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-            for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-            for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-                for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                    for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                        for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                            size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                            size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                            dst_ptr[index_out] = dst_ptr[index_out] - dst_ptr[index_out] / src_ptr[index_in] * src_ptr[index_in];
+
+                    auto parent_spd = parentPtr->getSelectedPrimitiveDescriptor();
+
+                    if (parent_spd != nullptr && !parent_spd->getConfig().outConfs.empty()) {  // parents without a selected descriptor or without outputs add no match
+                        int inNum = parentEdge->getInputNum();
+                        if (inNum < 0 || inNum >= parent_spd->getConfig().outConfs.size()) {
+                            inNum = 0;  // index out of range for the parent's outConfs: compare against its first output config
+                        }
+                        if (MKLDNNExtensionUtils::initTensorsAreEqual(
+                                getSupportedPrimitiveDescriptors()[i].getConfig().inConfs[j].desc,
+                                parent_spd->getConfig().outConfs[inNum].desc)) {
+                            equalsLocalFormatCount++;  // input j's descriptor was reported equal to the parent's output descriptor
                         }
                     }
                 }
+                if (equalsLocalFormatCount > equalsFormatCount) {  // strict '>' keeps the earliest descriptor among equal scores
+                    equalsFormatCount = equalsLocalFormatCount;
+                    selectedPrimitive = static_cast<int>(i);
+                }
             }
         }
-#else
-            parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-                for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                    size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                    size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                    dst_ptr[index_out] = dst_ptr[index_out] - dst_ptr[index_out] / src_ptr[index_in] * src_ptr[index_in];
-                }
-            });
-#endif
+        if (selectedPrimitive >= 0) {  // the first priority type that has any candidate wins; later types are not examined
+            selectPrimitiveDescriptorByIndex(selectedPrimitive);
+            return;
         }
     }
+
+    if (getSupportedPrimitiveDescriptors().empty())
+        THROW_IE_EXCEPTION << "Supported primitive descriptors list is empty for node: " << getName();
+    // Fallback: if no supported descriptor has a type from the priority list, select the first one.
+    selectPrimitiveDescriptorByIndex(0);
 }
 
-template <typename T0, typename T1> void MKLDNNEltwiseNode::eltwise_pow(
-        const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) {
-    if (!broadcast) {
-#ifdef _WIN32
-        for (size_t i = 0; i < dst_data_size; i++) {
-            dst_ptr[i] = std::pow(src0_ptr[i], src1_ptr[i]);
-    }
-#else
-        parallel_for(dst_data_size, [&](size_t i) {
-            dst_ptr[i] = std::pow(src0_ptr[i], src1_ptr[i]);
-        });
-#endif
-        for (int j = 2; j < getParentEdges().size(); j++) {
-            const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
-                                getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
-            for (size_t i = 0; i < dst_data_size; i++) {
-                dst_ptr[i] = std::pow(dst_ptr[i], src_ptr[i]);
-            }
-#else
-            parallel_for(dst_data_size, [&](size_t i) {
-                dst_ptr[i] = std::pow(dst_ptr[i], src_ptr[i]);
-            });
-#endif
-        }
-    } else {
-        int dims_out[5], dims_in0[5], dims_in1[5];
-        int offset_out[5], offset_in0[5], offset_in1[5];
-        auto& child_edge_dims = getChildEdgeAt(0)->getDims();
-        auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
-        auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
-        dims_calc(dims_out, child_edge_dims);
-        dims_calc(dims_in0, parent0_edge_dims);
-        dims_calc(dims_in1, parent1_edge_dims);
-        offset_out_calc(offset_out, dims_out);
-        offset_in_calc(offset_in0, dims_in0, dims_out);
-        offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-        for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-        for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-            for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                    for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                        size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                        size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                        size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                        dst_ptr[index_out] = std::pow(src0_ptr[index_in0], src1_ptr[index_in1]);
-                    }
-                }
-            }
-        }
-    }
-#else
-        parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-            for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                dst_ptr[index_out] = std::pow(src0_ptr[index_in0], src1_ptr[index_in1]);
-            }
-        });
-#endif
-        for (size_t n = 2; n < getParentEdges().size(); n++) {
-            const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
-                                getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
-            auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
-            dims_calc(dims_in1, parent_edge_dims);
-            offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-            for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-            for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-                for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                    for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                        for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                            size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                            size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                            dst_ptr[index_out] = std::pow(dst_ptr[index_out], src_ptr[index_in]);
-                        }
-                    }
-                }
-            }
-        }
-#else
-            parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-                for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                    size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                    size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                    dst_ptr[index_out] = std::pow(dst_ptr[index_out], src_ptr[index_in]);
-                }
-            });
-#endif
-        }
+// Fills `offset` with dense strides for `dims`, innermost dimension last:
+// offset[i] is the product of dims[i + 1 .. offset.size() - 1], so the last
+// entry gets stride 1. `offset` must already be sized by the caller and `dims`
+// must provide at least offset.size() entries; `dims` is only read.
+void MKLDNNEltwiseNode::offset_out_calc(std::vector<size_t>& offset, std::vector<size_t>& dims) {
+    // Accumulate in size_t: the running product is built from size_t dims and
+    // stored into size_t strides, so an int accumulator would narrow and
+    // overflow for large tensors.
+    size_t stride = 1;
+    // Unsigned-safe reverse walk from the innermost dimension outwards; the
+    // `idx-- > 0` form also leaves an empty `offset` untouched.
+    for (size_t idx = offset.size(); idx-- > 0;) {
+        offset[idx] = stride;
+        stride *= dims[idx];
     }
 }
 
-template <typename T0, typename T1, typename T2> void MKLDNNEltwiseNode::eltwise_equal(
-        const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, const size_t dst_data_size) {
-    if (!broadcast) {
-#ifdef _WIN32
-        for (size_t i = 0; i < dst_data_size; i++) {
-            dst_ptr[i] = src0_ptr[i] == src1_ptr[i];
-    }
-#else
-        parallel_for(dst_data_size, [&](size_t i) {
-            dst_ptr[i] = src0_ptr[i] == src1_ptr[i];
-        });
-#endif
-        for (int j = 2; j < getParentEdges().size(); j++) {
-            const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
-                                getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
-            for (size_t i = 0; i < dst_data_size; i++) {
-                dst_ptr[i] = dst_ptr[i] == src_ptr[i];
-            }
-#else
-            parallel_for(dst_data_size, [&](size_t i) {
-                dst_ptr[i] = dst_ptr[i] == src_ptr[i];
-            });
-#endif
-        }
-    } else {
-        int dims_out[5], dims_in0[5], dims_in1[5];
-        int offset_out[5], offset_in0[5], offset_in1[5];
-        auto& child_edge_dims = getChildEdgeAt(0)->getDims();
-        auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
-        auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
-        dims_calc(dims_out, child_edge_dims);
-        dims_calc(dims_in0, parent0_edge_dims);
-        dims_calc(dims_in1, parent1_edge_dims);
-        offset_out_calc(offset_out, dims_out);
-        offset_in_calc(offset_in0, dims_in0, dims_out);
-        offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-        for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-        for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-            for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                    for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                        size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                        size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                        size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                        dst_ptr[index_out] = src0_ptr[index_in0] == src1_ptr[index_in1];
-                    }
-                }
-            }
-        }
-    }
-#else
-        parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-            for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                dst_ptr[index_out] = src0_ptr[index_in0] == src1_ptr[index_in1];
-            }
-        });
-#endif
-        for (size_t n = 2; n < getParentEdges().size(); n++) {
-            const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
-                                getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
-            auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
-            dims_calc(dims_in1, parent_edge_dims);
-            offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-            for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-            for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-                for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                    for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                        for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                            size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                            size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                            dst_ptr[index_out] = dst_ptr[index_out] == src_ptr[index_in];
-                        }
-                    }
-                }
-            }
-        }
-#else
-            parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-                for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                    size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                    size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                    dst_ptr[index_out] = dst_ptr[index_out] == src_ptr[index_in];
-                }
-            });
-#endif
-        }
+// Computes per-index strides for an input against the output extents.
+// Walking from the last index to the first, offset[i] receives the running product of
+// dims_in[i + 1 .. end] when dims_in[i] equals dims_out[i], and 0 otherwise, so an index
+// along a mismatching axis contributes nothing to the input position.
+//   offset   - output; its size decides how many entries are written
+//   dims_in  - input extents; must hold at least offset.size() entries (not checked here)
+//   dims_out - output extents; must hold at least offset.size() entries (not checked here)
+void MKLDNNEltwiseNode::offset_in_calc(std::vector<size_t>& offset, std::vector<size_t>& dims_in, std::vector<size_t>& dims_out) {
+    // Accumulate in size_t, the element type of the vectors, so that a large
+    // product of extents is not narrowed through int.
+    size_t k = 1;
+    for (ptrdiff_t i = static_cast<ptrdiff_t>(offset.size()) - 1; i >= 0; i--) {
+        offset[i] = (dims_in[i] == dims_out[i]) ? k : 0;
+        k *= dims_in[i];
     }
 }
 
-template <typename T0, typename T1, typename T2> void MKLDNNEltwiseNode::eltwise_not_equal(
-        const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, const size_t dst_data_size) {
-    if (!broadcast) {
-#ifdef _WIN32
-        for (size_t i = 0; i < dst_data_size; i++) {
-            dst_ptr[i] = src0_ptr[i] != src1_ptr[i];
-    }
-#else
-        parallel_for(dst_data_size, [&](size_t i) {
-            dst_ptr[i] = src0_ptr[i] != src1_ptr[i];
-        });
-#endif
-        for (int j = 2; j < getParentEdges().size(); j++) {
-            const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
-                                getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
-            for (size_t i = 0; i < dst_data_size; i++) {
-                dst_ptr[i] = dst_ptr[i] != src_ptr[i];
-            }
-#else
-            parallel_for(dst_data_size, [&](size_t i) {
-                dst_ptr[i] = dst_ptr[i] != src_ptr[i];
-            });
-#endif
-        }
-    } else {
-        int dims_out[5], dims_in0[5], dims_in1[5];
-        int offset_out[5], offset_in0[5], offset_in1[5];
-        auto& child_edge_dims = getChildEdgeAt(0)->getDims();
-        auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
-        auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
-        dims_calc(dims_out, child_edge_dims);
-        dims_calc(dims_in0, parent0_edge_dims);
-        dims_calc(dims_in1, parent1_edge_dims);
-        offset_out_calc(offset_out, dims_out);
-        offset_in_calc(offset_in0, dims_in0, dims_out);
-        offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-        for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-        for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-            for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                    for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                        size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                        size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                        size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                        dst_ptr[index_out] = src0_ptr[index_in0] != src1_ptr[index_in1];
-                    }
-                }
+// Invokes the JIT eltwise kernel once for every (i0..i4) combination of the first five
+// entries of dims_out, distributing the combinations with parallel_for5d.
+//   src_ptrs - one base pointer per input; its size is used as the number of inputs
+//   dst_ptr  - base pointer of the output
+// NOTE(review): index_in has MAX_ELTWISE_INPUTS entries and no bound check on inputNum is
+// visible in this function - presumably guaranteed elsewhere; confirm.
+void MKLDNNEltwiseNode::executeOptimized6D(const std::vector<const uint8_t *>& src_ptrs, uint8_t *dst_ptr) {
+    size_t inputNum = src_ptrs.size();
+
+    parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4],
+        [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) {
+            // TODO: reimplement initializer via jit approach
+            size_t index_in[MAX_ELTWISE_INPUTS] = {0};
+            for (int i = 0; i < inputNum; i++) {
+                index_in[i] = i0 * offsets_in[i][0] + i1 * offsets_in[i][1] + i2 * offsets_in[i][2] +
+                              i3 * offsets_in[i][3] + i4 * offsets_in[i][4];
             }
-        }
-    }
-#else
-        parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-            for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                dst_ptr[index_out] = src0_ptr[index_in0] != src1_ptr[index_in1];
+            size_t index_out = i0 * offsets_out[0] + i1 * offsets_out[1] + i2 * offsets_out[2] +
+                               i3 * offsets_out[3] + i4 * offsets_out[4];
+
+            // index_in / index_out are added to uint8_t pointers, i.e. they act as byte offsets.
+            auto arg = jit_eltwise_call_args();
+            for (int i = 0; i < inputNum; i++) {
+                arg.src_ptr[i] = src_ptrs[i] + index_in[i];
             }
+            arg.dst = dst_ptr + index_out;
+            // The kernel receives the last entry of dims_out as its work amount.
+            arg.work_amount = static_cast<size_t>(dims_out[dims_out.size() - 1]);
+            // oc_off: offsets_oc-weighted position, scaled by sizeof(float).
+            arg.oc_off = (i0 * offsets_oc[0] + i1 * offsets_oc[1] + i2 * offsets_oc[2] +
+                          i3 * offsets_oc[3] + i4 * offsets_oc[4]) * sizeof(float);
+
+            (*eltwise_kernel)(&arg);
         });
-#endif
-        for (size_t n = 2; n < getParentEdges().size(); n++) {
-            const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
-                                getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
-            auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
-            dims_calc(dims_in1, parent_edge_dims);
-            offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-            for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-            for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-                for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                    for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                        for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                            size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                            size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                            dst_ptr[index_out] = dst_ptr[index_out] != src_ptr[index_in];
-                        }
-                    }
-                }
-            }
-        }
-#else
-            parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-                for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                    size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                    size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                    dst_ptr[index_out] = dst_ptr[index_out] != src_ptr[index_in];
-                }
-            });
-#endif
-        }
-    }
 }
 
-template <typename T0, typename T1, typename T2> void MKLDNNEltwiseNode::eltwise_less(
-        const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, const size_t dst_data_size) {
-    if (!broadcast) {
-#ifdef _WIN32
-        for (size_t i = 0; i < dst_data_size; i++) {
-            dst_ptr[i] = src0_ptr[i] < src1_ptr[i];
-    }
-#else
-        parallel_for(dst_data_size, [&](size_t i) {
-            dst_ptr[i] = src0_ptr[i] < src1_ptr[i];
-        });
-#endif
-        for (int j = 2; j < getParentEdges().size(); j++) {
-            const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
-                                getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
-            for (size_t i = 0; i < dst_data_size; i++) {
-                dst_ptr[i] = dst_ptr[i] < src_ptr[i];
-            }
-#else
-            parallel_for(dst_data_size, [&](size_t i) {
-                dst_ptr[i] = dst_ptr[i] < src_ptr[i];
-            });
-#endif
-        }
-    } else {
-        int dims_out[5], dims_in0[5], dims_in1[5];
-        int offset_out[5], offset_in0[5], offset_in1[5];
-        auto& child_edge_dims = getChildEdgeAt(0)->getDims();
-        auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
-        auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
-        dims_calc(dims_out, child_edge_dims);
-        dims_calc(dims_in0, parent0_edge_dims);
-        dims_calc(dims_in1, parent1_edge_dims);
-        offset_out_calc(offset_out, dims_out);
-        offset_in_calc(offset_in0, dims_in0, dims_out);
-        offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-        for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-        for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-            for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                    for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                        size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                        size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                        size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                        dst_ptr[index_out] = src0_ptr[index_in0] < src1_ptr[index_in1];
-                    }
-                }
-            }
-        }
-    }
-#else
-        parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-            for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                dst_ptr[index_out] = src0_ptr[index_in0] < src1_ptr[index_in1];
+// Invokes the JIT eltwise kernel over schedulerWorkAmount work items, split across threads
+// with parallel_nt + splitter. Each work item is decoded into one counter per output
+// dimension except the last; the last entry of dims_out is passed to the kernel as work_amount.
+//   src_ptrs - one base pointer per input; its size is used as the number of inputs
+//   dst_ptr  - base pointer of the output
+// NOTE(review): presumably schedulerWorkAmount equals the product of all but the last entry
+// of dims_out, and inputNum never exceeds MAX_ELTWISE_INPUTS - neither is checked here; confirm.
+void MKLDNNEltwiseNode::executeOptimizedGeneric(const std::vector<const uint8_t *>& src_ptrs, uint8_t *dst_ptr) {
+    size_t inputNum = src_ptrs.size();
+
+    parallel_nt(0, [&](const int ithr, const int nthr) {
+        size_t start = 0, end = 0;
+        splitter(schedulerWorkAmount, nthr, ithr, start, end);
+
+        std::vector<size_t> counters(dims_out.size() - 1, 0);
+
+        for (size_t iwork = start; iwork < end; ++iwork) {
+            // Decode the linear work index into per-dimension counters, innermost first.
+            size_t tmp = iwork;
+            for (ptrdiff_t j = dims_out.size() - 2; j >= 0; j--) {
+                counters[j] = tmp % dims_out[j];
+                tmp /= dims_out[j];
             }
-        });
-#endif
-        for (size_t n = 2; n < getParentEdges().size(); n++) {
-            const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
-                                getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
-            auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
-            dims_calc(dims_in1, parent_edge_dims);
-            offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-            for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-            for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-                for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                    for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                        for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                            size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                            size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                            dst_ptr[index_out] = dst_ptr[index_out] < src_ptr[index_in];
-                        }
-                    }
+
+            size_t index_in[MAX_ELTWISE_INPUTS] = {0};
+            for (int i = 0; i < inputNum; i++) {
+                index_in[i] = 0;
+                for (int j = 0; j < counters.size(); j++) {
+                    index_in[i] += counters[j] * offsets_in[i][j];
                 }
             }
-        }
-#else
-            parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-                for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                    size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                    size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                    dst_ptr[index_out] = dst_ptr[index_out] < src_ptr[index_in];
-                }
-            });
-#endif
-        }
-    }
-}
 
-template <typename T0, typename T1, typename T2> void MKLDNNEltwiseNode::eltwise_less_equal(
-        const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, const size_t dst_data_size) {
-    if (!broadcast) {
-#ifdef _WIN32
-        for (size_t i = 0; i < dst_data_size; i++) {
-            dst_ptr[i] = src0_ptr[i] <= src1_ptr[i];
-    }
-#else
-        parallel_for(dst_data_size, [&](size_t i) {
-            dst_ptr[i] = src0_ptr[i] <= src1_ptr[i];
-        });
-#endif
-        for (int j = 2; j < getParentEdges().size(); j++) {
-            const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
-                                getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
-            for (size_t i = 0; i < dst_data_size; i++) {
-                dst_ptr[i] = dst_ptr[i] <= src_ptr[i];
-            }
-#else
-            parallel_for(dst_data_size, [&](size_t i) {
-                dst_ptr[i] = dst_ptr[i] <= src_ptr[i];
-            });
-#endif
-        }
-    } else {
-        int dims_out[5], dims_in0[5], dims_in1[5];
-        int offset_out[5], offset_in0[5], offset_in1[5];
-        auto& child_edge_dims = getChildEdgeAt(0)->getDims();
-        auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
-        auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
-        dims_calc(dims_out, child_edge_dims);
-        dims_calc(dims_in0, parent0_edge_dims);
-        dims_calc(dims_in1, parent1_edge_dims);
-        offset_out_calc(offset_out, dims_out);
-        offset_in_calc(offset_in0, dims_in0, dims_out);
-        offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-        for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-        for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-            for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                    for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                        size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                        size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                        size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                        dst_ptr[index_out] = src0_ptr[index_in0] <= src1_ptr[index_in1];
-                    }
-                }
+            size_t index_out = 0;
+            for (int j = 0; j < counters.size(); j++) {
+                index_out += counters[j] * offsets_out[j];
             }
-        }
-    }
-#else
-        parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-            for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                dst_ptr[index_out] = src0_ptr[index_in0] <= src1_ptr[index_in1];
+
+            // index_in / index_out are added to uint8_t pointers, i.e. they act as byte offsets.
+            auto arg = jit_eltwise_call_args();
+            for (int i = 0; i < inputNum; i++) {
+                arg.src_ptr[i] = src_ptrs[i] + index_in[i];
             }
-        });
-#endif
-        for (size_t n = 2; n < getParentEdges().size(); n++) {
-            const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
-                                getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
-            auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
-            dims_calc(dims_in1, parent_edge_dims);
-            offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-            for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-            for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-                for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                    for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                        for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                            size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                            size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                            dst_ptr[index_out] = dst_ptr[index_out] <= src_ptr[index_in];
-                        }
-                    }
-                }
+            arg.dst = dst_ptr + index_out;
+            arg.work_amount = static_cast<size_t>(dims_out[dims_out.size() - 1]);
+
+            // oc_off: offsets_oc-weighted position, each term scaled by sizeof(float).
+            arg.oc_off = 0;
+            for (int j = 0; j < counters.size(); j++) {
+                arg.oc_off += counters[j] * offsets_oc[j] * sizeof(float);
             }
+
+            (*eltwise_kernel)(&arg);
         }
-#else
-            parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-                for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                    size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                    size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                    dst_ptr[index_out] = dst_ptr[index_out] <= src_ptr[index_in];
-                }
-            });
-#endif
-        }
-    }
+    });
 }
 
-template <typename T0, typename T1, typename T2> void MKLDNNEltwiseNode::eltwise_greater(
-        const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, const size_t dst_data_size) {
-    if (!broadcast) {
-#ifdef _WIN32
-        for (size_t i = 0; i < dst_data_size; i++) {
-            dst_ptr[i] = src0_ptr[i] > src1_ptr[i];
-    }
-#else
-        parallel_for(dst_data_size, [&](size_t i) {
-            dst_ptr[i] = src0_ptr[i] > src1_ptr[i];
-        });
-#endif
-        for (int j = 2; j < getParentEdges().size(); j++) {
-            const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
-                                getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
-            for (size_t i = 0; i < dst_data_size; i++) {
-                dst_ptr[i] = dst_ptr[i] > src_ptr[i];
-            }
-#else
-            parallel_for(dst_data_size, [&](size_t i) {
-                dst_ptr[i] = dst_ptr[i] > src_ptr[i];
-            });
-#endif
-        }
-    } else {
-        int dims_out[5], dims_in0[5], dims_in1[5];
-        int offset_out[5], offset_in0[5], offset_in1[5];
-        auto& child_edge_dims = getChildEdgeAt(0)->getDims();
-        auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
-        auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
-        dims_calc(dims_out, child_edge_dims);
-        dims_calc(dims_in0, parent0_edge_dims);
-        dims_calc(dims_in1, parent1_edge_dims);
-        offset_out_calc(offset_out, dims_out);
-        offset_in_calc(offset_in0, dims_in0, dims_out);
-        offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-        for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-        for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-            for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                    for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                        size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                        size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                        size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                        dst_ptr[index_out] = src0_ptr[index_in0] > src1_ptr[index_in1];
-                    }
-                }
-            }
-        }
+// Scalar fallback: processes fullWorkAmount elements one at a time, split across threads with
+// parallel_nt + splitter. Each element index is decoded into one counter per entry of dims_out,
+// every input is read as a float, the operation chosen by getOpType() is applied, and the
+// result is stored as a float.
+//   src_ptrs - one base pointer per input; its size is used as the number of inputs
+//   dst_ptr  - base pointer of the output
+// Throws via THROW_IE_EXCEPTION for an operation type not listed in the switch.
+// NOTE(review): ref_eltwise_injector stays null when eltwiseAlgorithm == mkldnn::algorithm_undef,
+// yet the activation cases dereference it without a check - presumably those op types always
+// set eltwiseAlgorithm; confirm.
+void MKLDNNEltwiseNode::executeReference(const std::vector<const uint8_t *>& src_ptrs, uint8_t *dst_ptr) {
+    size_t inputNum = src_ptrs.size();
+
+    std::shared_ptr<ref_eltwise_scalar_fwd_t> ref_eltwise_injector = nullptr;
+    if (eltwiseAlgorithm != mkldnn::algorithm_undef) {
+        ref_eltwise_injector = std::make_shared<ref_eltwise_scalar_fwd_t>(static_cast<mkldnn_alg_kind_t>(eltwiseAlgorithm), alpha, beta);
     }
-#else
-        parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-            for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                dst_ptr[index_out] = src0_ptr[index_in0] > src1_ptr[index_in1];
+
+    parallel_nt(0, [&](const int ithr, const int nthr) {
+        size_t start = 0, end = 0;
+        splitter(fullWorkAmount, nthr, ithr, start, end);
+
+        std::vector<size_t> counters(dims_out.size(), 0);
+
+        for (size_t iwork = start; iwork < end; ++iwork) {
+            // Decode the linear element index into per-dimension counters, innermost first.
+            size_t tmp = iwork;
+            for (ptrdiff_t j = dims_out.size() - 1; j >= 0; j--) {
+                counters[j] = tmp % dims_out[j];
+                tmp /= dims_out[j];
             }
-        });
-#endif
-        for (size_t n = 2; n < getParentEdges().size(); n++) {
-            const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
-                                getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
-            auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
-            dims_calc(dims_in1, parent_edge_dims);
-            offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-            for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-            for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-                for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                    for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                        for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                            size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                            size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                            dst_ptr[index_out] = dst_ptr[index_out] > src_ptr[index_in];
-                        }
-                    }
+
+            size_t index_in[MAX_ELTWISE_INPUTS] = {0};
+            for (int i = 0; i < inputNum; i++) {
+                index_in[i] = 0;
+                for (int j = 0; j < counters.size(); j++) {
+                    index_in[i] += counters[j] * offsets_in[i][j];
                 }
             }
-        }
-#else
-            parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-                for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                    size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                    size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                    dst_ptr[index_out] = dst_ptr[index_out] > src_ptr[index_in];
-                }
-            });
-#endif
-        }
-    }
+
+            size_t index_out = 0;
+            for (int j = 0; j < counters.size(); j++) {
+                index_out += counters[j] * offsets_out[j];
+            }
+
+            // index_in / index_out are byte offsets (added to uint8_t pointers); the addressed
+            // storage is then reinterpreted as float.
+            // NOTE(review): src_f is constructed anew for every element; it looks hoistable out
+            // of the loop - confirm before changing.
+            std::vector<float> src_f(inputNum);
+            for (int i = 0; i < inputNum; i++) {
+                src_f[i] = reinterpret_cast<const float *>(src_ptrs[i] + index_in[i])[0];
+            }
+            float* dst_ptr_f = reinterpret_cast<float *>(dst_ptr + index_out);
+
+            switch (getOpType()) {
+                // Single-input activations are delegated to the reference injector.
+                case Relu: case Gelu: case Elu: case Tanh: case Logistic: case Square: case Abs: case Sqrt:
+                case Linear: case BoundedRelu: case SoftRelu: case Relu6: case Exp: case Clamp: case Swish: case Hswish: case Mish: case Hsigmoid:
+                    *dst_ptr_f = ref_eltwise_injector->compute_scalar(src_f[0]); break;
+                case Add:               *dst_ptr_f = src_f[0] + src_f[1]; break;
+                case MulAdd:            *dst_ptr_f = src_f[0] * src_f[1] + src_f[2]; break;
+                case Subtract:          *dst_ptr_f = src_f[0] - src_f[1]; break;
+                case Multiply:          *dst_ptr_f = src_f[0] * src_f[1]; break;
+                case Divide:            *dst_ptr_f = src_f[0] / src_f[1]; break;
+                case FloorMod:          *dst_ptr_f = src_f[0] - floorf(src_f[0] / src_f[1]) * src_f[1]; break;
+                case Mod:               *dst_ptr_f = src_f[0] - truncf(src_f[0] / src_f[1]) * src_f[1]; break;
+                case Maximum:           *dst_ptr_f = std::max(src_f[0], src_f[1]); break;
+                case Minimum:           *dst_ptr_f = std::min(src_f[0], src_f[1]); break;
+                case SquaredDifference: *dst_ptr_f = powf((src_f[0] - src_f[1]), 2.f); break;
+                case PowerDynamic:      *dst_ptr_f = powf(src_f[0], src_f[1]); break;
+                // Comparison and logical results are stored as float 0 or 1.
+                case Equal:             *dst_ptr_f = src_f[0] == src_f[1]; break;
+                case NotEqual:          *dst_ptr_f = src_f[0] != src_f[1]; break;
+                case Greater:           *dst_ptr_f = src_f[0] > src_f[1]; break;
+                case GreaterEqual:      *dst_ptr_f = src_f[0] >= src_f[1]; break;
+                case Less:              *dst_ptr_f = src_f[0] < src_f[1]; break;
+                case LessEqual:         *dst_ptr_f = src_f[0] <= src_f[1]; break;
+                case LogicalAnd:        *dst_ptr_f = src_f[0] && src_f[1]; break;
+                case LogicalOr:         *dst_ptr_f = src_f[0] || src_f[1]; break;
+                case LogicalXor:        *dst_ptr_f = (src_f[0] || src_f[1]) - (src_f[0] && src_f[1]); break;
+                case LogicalNot:        *dst_ptr_f = !src_f[0]; break;
+                case PowerStatic:       *dst_ptr_f = powf(beta * src_f[0] + gamma, alpha); break;
+                case Prelu:             *dst_ptr_f = src_f[0] > 0 ? src_f[0] : src_f[0] * src_f[1]; break;
+                default: THROW_IE_EXCEPTION << "Unsupported operation type for Eltwise node with name `" << getName() << "`";
+            }
+        }
+    });
 }
 
-template <typename T0, typename T1, typename T2> void MKLDNNEltwiseNode::eltwise_greater_equal(
-        const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, const size_t dst_data_size) {
-    if (!broadcast) {
-#ifdef _WIN32
-        for (size_t i = 0; i < dst_data_size; i++) {
-            dst_ptr[i] = src0_ptr[i] >= src1_ptr[i];
+void MKLDNNEltwiseNode::execute(mkldnn::stream strm) {
+    size_t inputNum = getParentEdges().size();
+
+    std::vector<const uint8_t *> src_ptrs(inputNum);
+    for (int i = 0; i < inputNum; i++) {
+        src_ptrs[i] = reinterpret_cast<const uint8_t*>(getParentEdgeAt(i)->getMemory().GetData()) + start_offset_in[i];
     }
-#else
-        parallel_for(dst_data_size, [&](size_t i) {
-            dst_ptr[i] = src0_ptr[i] >= src1_ptr[i];
-        });
-#endif
-        for (int j = 2; j < getParentEdges().size(); j++) {
-            const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
-                                getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
-            for (size_t i = 0; i < dst_data_size; i++) {
-                dst_ptr[i] = dst_ptr[i] >= src_ptr[i];
-            }
-#else
-            parallel_for(dst_data_size, [&](size_t i) {
-                dst_ptr[i] = dst_ptr[i] >= src_ptr[i];
-            });
-#endif
+    uint8_t *dst_ptr = reinterpret_cast<uint8_t*>(getChildEdgeAt(0)->getMemory().GetData()) + start_offset_out;
+
+    // In the general case we would need to recompute offsets as well, but currently all supported layouts assume batch to be the outermost dimension
+    if (isDynBatchEnabled)
+        dims_out[batchDimIdx] = static_cast<size_t>(batchToProcess());
+
+    if (eltwise_kernel) {
+        if (tensorRank == optimalTensorRank) {
+            executeOptimized6D(src_ptrs, dst_ptr);
+        } else {
+            executeOptimizedGeneric(src_ptrs, dst_ptr);
         }
     } else {
-        int dims_out[5], dims_in0[5], dims_in1[5];
-        int offset_out[5], offset_in0[5], offset_in1[5];
-        auto& child_edge_dims = getChildEdgeAt(0)->getDims();
-        auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
-        auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
-        dims_calc(dims_out, child_edge_dims);
-        dims_calc(dims_in0, parent0_edge_dims);
-        dims_calc(dims_in1, parent1_edge_dims);
-        offset_out_calc(offset_out, dims_out);
-        offset_in_calc(offset_in0, dims_in0, dims_out);
-        offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-        for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-        for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-            for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                    for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                        size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                        size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                        size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                        dst_ptr[index_out] = src0_ptr[index_in0] >= src1_ptr[index_in1];
-                    }
-                }
-            }
-        }
-    }
-#else
-        parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-            for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                dst_ptr[index_out] = src0_ptr[index_in0] >= src1_ptr[index_in1];
-            }
-        });
-#endif
-        for (size_t n = 2; n < getParentEdges().size(); n++) {
-            const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
-                                getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
-            auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
-            dims_calc(dims_in1, parent_edge_dims);
-            offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-            for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-            for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-                for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                    for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                        for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                            size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                            size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                            dst_ptr[index_out] = dst_ptr[index_out] >= src_ptr[index_in];
-                        }
-                    }
-                }
-            }
-        }
-#else
-            parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-                for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                    size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                    size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                    dst_ptr[index_out] = dst_ptr[index_out] >= src_ptr[index_in];
-                }
-            });
-#endif
-        }
+        executeReference(src_ptrs, dst_ptr);
     }
 }
 
-template <typename T0, typename T1> void MKLDNNEltwiseNode::eltwise_logical_and(
-        const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) {
-    if (!broadcast) {
-#ifdef _WIN32
-        for (size_t i = 0; i < dst_data_size; i++) {
-            dst_ptr[i] = src0_ptr[i] && src1_ptr[i];
-    }
-#else
-        parallel_for(dst_data_size, [&](size_t i) {
-            dst_ptr[i] = src0_ptr[i] && src1_ptr[i];
-        });
-#endif
-        for (int j = 2; j < getParentEdges().size(); j++) {
-            const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
-                                getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
-            for (size_t i = 0; i < dst_data_size; i++) {
-                dst_ptr[i] = dst_ptr[i] && src_ptr[i];
-            }
-#else
-            parallel_for(dst_data_size, [&](size_t i) {
-                dst_ptr[i] = dst_ptr[i] && src_ptr[i];
-            });
-#endif
-        }
-    } else {
-        int dims_out[5], dims_in0[5], dims_in1[5];
-        int offset_out[5], offset_in0[5], offset_in1[5];
-        auto& child_edge_dims = getChildEdgeAt(0)->getDims();
-        auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
-        auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
-        dims_calc(dims_out, child_edge_dims);
-        dims_calc(dims_in0, parent0_edge_dims);
-        dims_calc(dims_in1, parent1_edge_dims);
-        offset_out_calc(offset_out, dims_out);
-        offset_in_calc(offset_in0, dims_in0, dims_out);
-        offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-        for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-        for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-            for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                    for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                        size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                        size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                        size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                        dst_ptr[index_out] = src0_ptr[index_in0] && src1_ptr[index_in1];
-                    }
-                }
-            }
-        }
-    }
-#else
-        parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-            for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                dst_ptr[index_out] = src0_ptr[index_in0] && src1_ptr[index_in1];
-            }
-        });
-#endif
-        for (size_t n = 2; n < getParentEdges().size(); n++) {
-            const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
-                                getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
-            auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
-            dims_calc(dims_in1, parent_edge_dims);
-            offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-            for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-            for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-                for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                    for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                        for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                            size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                            size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                            dst_ptr[index_out] = dst_ptr[index_out] && src_ptr[index_in];
-                        }
-                    }
-                }
-            }
-        }
-#else
-            parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-                for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                    size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                    size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                    dst_ptr[index_out] = dst_ptr[index_out] && src_ptr[index_in];
-                }
-            });
-#endif
-        }
-    }
+bool MKLDNNEltwiseNode::created() const {
+    return getType() == Eltwise;
 }
 
-template <typename T0, typename T1> void MKLDNNEltwiseNode::eltwise_logical_or(
-        const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) {
-    if (!broadcast) {
-#ifdef _WIN32
-        for (size_t i = 0; i < dst_data_size; i++) {
-            dst_ptr[i] = src0_ptr[i] || src1_ptr[i];
-    }
-#else
-        parallel_for(dst_data_size, [&](size_t i) {
-            dst_ptr[i] = src0_ptr[i] || src1_ptr[i];
-        });
-#endif
-        for (int j = 2; j < getParentEdges().size(); j++) {
-            const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
-                                getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
-            for (size_t i = 0; i < dst_data_size; i++) {
-                dst_ptr[i] = dst_ptr[i] || src_ptr[i];
-            }
-#else
-            parallel_for(dst_data_size, [&](size_t i) {
-                dst_ptr[i] = dst_ptr[i] || src_ptr[i];
-            });
-#endif
-        }
-    } else {
-        int dims_out[5], dims_in0[5], dims_in1[5];
-        int offset_out[5], offset_in0[5], offset_in1[5];
-        auto& child_edge_dims = getChildEdgeAt(0)->getDims();
-        auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
-        auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
-        dims_calc(dims_out, child_edge_dims);
-        dims_calc(dims_in0, parent0_edge_dims);
-        dims_calc(dims_in1, parent1_edge_dims);
-        offset_out_calc(offset_out, dims_out);
-        offset_in_calc(offset_in0, dims_in0, dims_out);
-        offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-        for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-        for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-            for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                    for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                        size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                        size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                        size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                        dst_ptr[index_out] = src0_ptr[index_in0] || src1_ptr[index_in1];
-                    }
-                }
-            }
-        }
+bool MKLDNNEltwiseNode::canBeInPlace() const {
+    if (getParentEdgesAtPort(0)[0]->getParent()->getType() == Input) {
+        return false;
     }
-#else
-        parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-            for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                dst_ptr[index_out] = src0_ptr[index_in0] || src1_ptr[index_in1];
-            }
-        });
-#endif
-        for (size_t n = 2; n < getParentEdges().size(); n++) {
-            const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
-                                getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
-            auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
-            dims_calc(dims_in1, parent_edge_dims);
-            offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-            for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-            for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-                for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                    for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                        for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                            size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                            size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                            dst_ptr[index_out] = dst_ptr[index_out] || src_ptr[index_in];
-                        }
-                    }
-                }
+
+    for (auto& parentEdge : getParentEdges()) {
+        auto parent = parentEdge.lock()->getParent();
+        if (parent->getChildEdges().size() != 1)
+            return false;
+
+        // Workaround to prevent memory corruption caused by the in-place feature
+        if (parent->getType() == Concatenation) {
+            for (auto& parentParentEdge : parent->getParentEdges()) {
+                auto parentParent = parentParentEdge.lock()->getParent();
+                if (parentParent->getChildEdges().size() != 1)
+                    return false;
             }
         }
-#else
-            parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-                for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                    size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                    size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                    dst_ptr[index_out] = dst_ptr[index_out] || src_ptr[index_in];
-                }
-            });
-#endif
-        }
     }
+
+    return getParentEdgesAtPort(0)[0].get()->getDims() == getChildEdgesAtPort(0)[0].get()->getDims();
 }
 
-template <typename T0, typename T1> void MKLDNNEltwiseNode::eltwise_logical_xor(
-        const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) {
-    if (!broadcast) {
-#ifdef _WIN32
-        for (size_t i = 0; i < dst_data_size; i++) {
-            dst_ptr[i] = (src0_ptr[i] || src1_ptr[i]) - (src0_ptr[i] && src1_ptr[i]);
-    }
-#else
-        parallel_for(dst_data_size, [&](size_t i) {
-            dst_ptr[i] = (src0_ptr[i] || src1_ptr[i]) - (src0_ptr[i] && src1_ptr[i]);
-        });
-#endif
-        for (int j = 2; j < getParentEdges().size(); j++) {
-            const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
-                                getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
-            for (size_t i = 0; i < dst_data_size; i++) {
-                dst_ptr[i] = (dst_ptr[i] || src_ptr[i]) - (dst_ptr[i] && src_ptr[i]);
-            }
-#else
-            parallel_for(dst_data_size, [&](size_t i) {
-                dst_ptr[i] = (dst_ptr[i] || src_ptr[i]) - (dst_ptr[i] && src_ptr[i]);
-            });
-#endif
-        }
-    } else {
-        int dims_out[5], dims_in0[5], dims_in1[5];
-        int offset_out[5], offset_in0[5], offset_in1[5];
-        auto& child_edge_dims = getChildEdgeAt(0)->getDims();
-        auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
-        auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
-        dims_calc(dims_out, child_edge_dims);
-        dims_calc(dims_in0, parent0_edge_dims);
-        dims_calc(dims_in1, parent1_edge_dims);
-        offset_out_calc(offset_out, dims_out);
-        offset_in_calc(offset_in0, dims_in0, dims_out);
-        offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-        for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-        for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-            for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                    for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                        size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                        size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                        size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                        dst_ptr[index_out] = (src0_ptr[index_in0] || src1_ptr[index_in1]) - (src0_ptr[index_in0] && src1_ptr[index_in1]);
-                    }
+void MKLDNNEltwiseNode::appendPostOps(mkldnn::post_ops& ops) {
+    switch (getAlgorithm()) {
+        case mkldnn::eltwise_relu:
+        case mkldnn::eltwise_tanh:
+        case mkldnn::eltwise_elu:
+        case mkldnn::eltwise_square:
+        case mkldnn::eltwise_abs:
+        case mkldnn::eltwise_sqrt:
+        case mkldnn::eltwise_linear:
+        case mkldnn::eltwise_bounded_relu:
+        case mkldnn::eltwise_soft_relu:
+        case mkldnn::eltwise_logistic:
+        case mkldnn::eltwise_exp:
+        case mkldnn::eltwise_gelu:
+        case mkldnn::eltwise_clamp:
+        case mkldnn::eltwise_swish:
+        case mkldnn::eltwise_hswish:
+        case mkldnn::eltwise_mish:
+        case mkldnn::eltwise_hsigmoid:
+            ops.append_eltwise(1.0, getAlgorithm(), getAlpha(), getBeta());
+            break;
+        case mkldnn::depthwise_scale_shift:
+        case mkldnn::depthwise_prelu:
+            if (scales.empty() && shifts.empty()) {
+                size_t bufferSize = static_cast<size_t>(outDims[0][outDims[0].size() > 1 ? 1 : 0]);
+                size_t bufferSizeAligned = rnd_up(bufferSize, 16);
+
+                Blob::Ptr scalesBlob = getCnnLayer()->blobs["weights"];
+                if (scalesBlob == nullptr)
+                    THROW_IE_EXCEPTION << "Cannot get weights blob in Eltwise node with name `" << getName() << "`";
+                scales.resize(bufferSizeAligned, 0);
+                const float *scalesBufferPtr = scalesBlob->buffer().as<float *>();
+                for (int i = 0; i < bufferSize; i++) {
+                    scales[i] = scalesBufferPtr[scalesBlob->size() == 1 ? 0 : i];
                 }
-            }
-        }
-    }
-#else
-        parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-            for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
-                size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                dst_ptr[index_out] = (src0_ptr[index_in0] || src1_ptr[index_in1]) - (src0_ptr[index_in0] && src1_ptr[index_in1]);
-            }
-        });
-#endif
-        for (size_t n = 2; n < getParentEdges().size(); n++) {
-            const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
-                                getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
-            auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
-            dims_calc(dims_in1, parent_edge_dims);
-            offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-            for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-            for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-                for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                    for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                        for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                            size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                            size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                            dst_ptr[index_out] = (dst_ptr[index_out] || src_ptr[index_in]) - (dst_ptr[index_out] && src_ptr[index_in]);
-                        }
+
+                Blob::Ptr shiftsBlob = getCnnLayer()->blobs["biases"];
+                if (shiftsBlob != nullptr) {
+                    shifts.resize(bufferSizeAligned, 0);
+                    const float *shiftsBufferPtr = shiftsBlob->buffer().as<float *>();
+                    for (int i = 0; i < bufferSize; i++) {
+                        shifts[i] = shiftsBufferPtr[shiftsBlob->size() == 1 ? 0 : i];
                     }
                 }
             }
-        }
-#else
-            parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-                for (int i4 = 0; i4 < dims_out[4]; i4++) {
-                    size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                    size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
-                    dst_ptr[index_out] = (dst_ptr[index_out] || src_ptr[index_in]) - (dst_ptr[index_out] && src_ptr[index_in]);
-                }
-            });
-#endif
-        }
-    }
-}
-
-template <typename T0, typename T1, typename T2> void MKLDNNEltwiseNode::ref_eltwise2(int in0, int in1) {
-    IE_ASSERT(getParentEdges().size() > 1);
-
-    auto& srcMemory0 = getParentEdgeAt(in0)->getMemory();
-    auto& srcMemory1 = getParentEdgeAt(in1)->getMemory();
-    const T0 *src0_ptr = reinterpret_cast<const T0*>(srcMemory0.GetData()) +
-        srcMemory0.GetDescriptor().data.layout_desc.blocking.offset_padding;
-    const T1 *src1_ptr = reinterpret_cast<const T1*>(srcMemory1.GetData()) +
-        srcMemory1.GetDescriptor().data.layout_desc.blocking.offset_padding;
-    T2 *dst_ptr = reinterpret_cast<T2*>(getChildEdgeAt(0)->getMemory().GetData()) +
-        getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
-    const size_t dst_data_size = srcMemory0.GetSize() / sizeof(T0) / srcMemory0.GetDims()[0] * batchToProcess();
-
-    switch (op) {
-        case EltwiseLayer::eOperation::Equal: eltwise_equal(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
-        case EltwiseLayer::eOperation::Not_equal: eltwise_not_equal(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
-        case EltwiseLayer::eOperation::Less: eltwise_less(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
-        case EltwiseLayer::eOperation::Less_equal: eltwise_less_equal(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
-        case EltwiseLayer::eOperation::Greater: eltwise_greater(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
-        case EltwiseLayer::eOperation::Greater_equal: eltwise_greater_equal(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
-        default: THROW_IE_EXCEPTION << "Unsupported operation type for Eltwise node";
-    }
-}
-
-template <typename T0, typename T1> void MKLDNNEltwiseNode::ref_eltwise(int in0, int in1) {
-    IE_ASSERT(getParentEdges().size() > 1);
-
-    auto& srcMemory0 = getParentEdgeAt(in0)->getMemory();
-    auto& srcMemory1 = getParentEdgeAt(in1)->getMemory();
-    const T0 *src0_ptr = reinterpret_cast<const T0*>(srcMemory0.GetData()) +
-            srcMemory0.GetDescriptor().data.layout_desc.blocking.offset_padding;
-    const T1 *src1_ptr = reinterpret_cast<const T1*>(srcMemory1.GetData()) +
-            srcMemory1.GetDescriptor().data.layout_desc.blocking.offset_padding;
-    T0 *dst_ptr = reinterpret_cast<T0*>(getChildEdgeAt(0)->getMemory().GetData()) +
-            getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
-    const size_t dst_data_size = srcMemory0.GetSize() / sizeof(T0) / srcMemory0.GetDims()[0] * batchToProcess();
-
-    switch (op) {
-        case EltwiseLayer::eOperation::Sum: eltwise_add(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
-        case EltwiseLayer::eOperation::Prod: eltwise_prod(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
-        case EltwiseLayer::eOperation::Max: eltwise_max(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
-        case EltwiseLayer::eOperation::Sub: eltwise_sub(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
-        case EltwiseLayer::eOperation::Min: eltwise_min(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
-        case EltwiseLayer::eOperation::Div: eltwise_div(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
-        case EltwiseLayer::eOperation::Squared_diff: eltwise_squared_diff(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
-        case EltwiseLayer::eOperation::Floor_mod: eltwise_floor_mod(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
-        case EltwiseLayer::eOperation::Pow: eltwise_pow(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
-        case EltwiseLayer::eOperation::Equal: eltwise_equal(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
-        case EltwiseLayer::eOperation::Not_equal: eltwise_not_equal(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
-        case EltwiseLayer::eOperation::Less: eltwise_less(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
-        case EltwiseLayer::eOperation::Less_equal: eltwise_less_equal(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
-        case EltwiseLayer::eOperation::Greater: eltwise_greater(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
-        case EltwiseLayer::eOperation::Greater_equal: eltwise_greater_equal(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
-        case EltwiseLayer::eOperation::Logical_AND: eltwise_logical_and(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
-        case EltwiseLayer::eOperation::Logical_OR: eltwise_logical_or(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
-        case EltwiseLayer::eOperation::Logical_XOR: eltwise_logical_xor(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
-        default: THROW_IE_EXCEPTION << "Unsupported operation type for Eltwise node";
-    }
-}
 
-void MKLDNNEltwiseNode::jit_eltwise_fq() {
-    auto& srcMemory0 = getParentEdgeAt(0)->getMemory();
-    auto& srcMemory1 = getParentEdgeAt(1)->getMemory();
-    auto& dstMemory = getChildEdgeAt(0)->getMemory();
-
-    const uint8_t *src0_ptr = reinterpret_cast<const uint8_t*>(srcMemory0.GetData()) +
-        srcMemory0.GetDescriptor().data.layout_desc.blocking.offset_padding *
-        MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(srcMemory0.GetDescriptor().data.data_type));
-    const uint8_t *src1_ptr = reinterpret_cast<const uint8_t*>(srcMemory1.GetData()) +
-        srcMemory1.GetDescriptor().data.layout_desc.blocking.offset_padding *
-        MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(srcMemory1.GetDescriptor().data.data_type));
-    uint8_t *dst_ptr = reinterpret_cast<uint8_t*>(dstMemory.GetData()) +
-        dstMemory.GetDescriptor().data.layout_desc.blocking.offset_padding *
-        MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(dstMemory.GetDescriptor().data.data_type));
-
-    if (!broadcast) {
-        auto& dims = getParentEdgeAt(0)->getDims();
-
-        int N = batchToProcess();
-        int C = dims[1];
-        int D = dims.ndims() > 4 ? dims[2] : 1;
-        int H = dims.ndims() > 2 ? dims[dims.ndims() - 2] : 1;
-        int W = dims.ndims() > 3 ? dims[dims.ndims() - 1] : 1;
-
-        parallel_for4d(N, D, H, W, [&](int n, int d, int h, int w) {
-            size_t off = n * D * H * W * C + d * H * W * C + h * W * C + w * C;
-
-            auto arg = jit_eltwise_fq_call_args();
-            arg.src0 = src0_ptr + off * jep.src0_data_size;
-            arg.src1 = src1_ptr + off * jep.src1_data_size;
-            arg.dst = dst_ptr + off * jep.dst_data_size;
-            arg.work_amount = static_cast<size_t>(C);
-
-            (*eltiwse_fq_kernel)(&arg);
-        });
-    } else {
-        int dims_out[5], dims_in0[5], dims_in1[5];
-        int offset_out[5], offset_in0[5], offset_in1[5];
-        auto& child_edge_dims = getChildEdgeAt(0)->getDims();
-        auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
-        auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
-        dims_calc(dims_out, child_edge_dims, true);
-        dims_calc(dims_in0, parent0_edge_dims, true);
-        dims_calc(dims_in1, parent1_edge_dims, true);
-        offset_out_calc(offset_out, dims_out);
-        offset_in_calc(offset_in0, dims_in0, dims_out);
-        offset_in_calc(offset_in1, dims_in1, dims_out);
-
-        parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
-            size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3];
-            size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3];
-            size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3];
-
-            auto arg = jit_eltwise_fq_call_args();
-            arg.src0 = src0_ptr + index_in0 * jep.src0_data_size;
-            arg.src1 = src1_ptr + index_in1 * jep.src1_data_size;
-            arg.dst = dst_ptr + index_out * jep.dst_data_size;
-            arg.work_amount = static_cast<size_t>(dims_out[4]);
-
-            (*eltiwse_fq_kernel)(&arg);
-        });
+            ops.append_depthwise(getAlgorithm(), &scales[0], shifts.empty() ? nullptr : &shifts[0]);
+            break;
+        default: THROW_IE_EXCEPTION << "Appending Eltwise node with name `" << getName() << "` as post operation is not supported";
     }
 }
 
-void MKLDNNEltwiseNode::execute(mkldnn::stream strm) {
-    if (prim) {
-        MKLDNNNode::execute(strm);
-    } else {
-        if (op == EltwiseLayer::Floor_mod) {
-            for (size_t i = 0; i < getParentEdges().size(); i++)
-                if (getParentEdgeAt(i)->getDesc().getPrecision() != Precision::I32)
-                    THROW_IE_EXCEPTION << "Floor_mod supports only I32 precision of inputs";
-            if (getChildEdgeAt(0)->getDesc().getPrecision() != Precision::I32)
-                THROW_IE_EXCEPTION << "Floor_mod supports only I32 precision of output";
-        }
-
-        if (getParentEdges().size() > 2) {
-            Precision pi = getParentEdgeAt(0)->getDesc().getPrecision();
-            Precision po = getChildEdgeAt(0)->getDesc().getPrecision();
-            for (int i = 1; i < getParentEdges().size(); i++) {
-                if (getParentEdgeAt(i)->getDesc().getPrecision() != pi)
-                    THROW_IE_EXCEPTION << "If Eltwise node has more than 2 inputs, all inputs must have same precision";
+bool MKLDNNEltwiseNode::canFuse(const MKLDNNNodePtr& node) const {
+    auto isOneOf = [](EltwiseOpType alg, std::vector<EltwiseOpType> algs) {
+        for (auto a : algs) {
+            if (alg == a) {
+                return true;
             }
-            if (pi != po) {
-                THROW_IE_EXCEPTION << "If Eltwise node has more than 2 inputs, all inputs and output must have same precision";
-            }
-            if (pi == Precision::FP32)
-                ref_eltwise<float, float>(0, 1);
-            else if (pi == Precision::I32)
-                ref_eltwise<int32_t, int32_t>(0, 1);
-            else if (pi == Precision::I8)
-                ref_eltwise<int8_t, int8_t>(0, 1);
-            else if (pi == Precision::U8)
-                ref_eltwise<uint8_t, uint8_t>(0, 1);
-            else
-                THROW_IE_EXCEPTION << "If Eltwise node has more than 2 inputs, only FP32, I32, I8, U8 are supported";
-            return;
         }
+        return false;
+    };
 
-        Precision pi0 = getParentEdgeAt(0)->getDesc().getPrecision();
-        Precision pi1 = getParentEdgeAt(1)->getDesc().getPrecision();
-        Precision po = getChildEdgeAt(0)->getDesc().getPrecision();
+    if (!mayiuse(cpu::sse42))
+        return false;
 
-        IE_ASSERT(getParentEdges().size() > 1);
+    // FQ inputs with quantization parameters will be hidden inside the post_op object, so they will not increase the number of inputs
+    size_t addedInputEdgesNum = node->getType() != Quantize ? (node->getParentEdges().size() - 1) : 0;
+    if (getParentEdges().size() + addedInputEdgesNum > MAX_ELTWISE_INPUTS)
+        return false;
 
-        if (!fusedWith.empty()) {
-            jit_eltwise_fq();
-        } else {
-            // Input and output types for eltwise compare operations can be different
-            bool is_eltwise_compare_node = (op == EltwiseLayer::Equal || op == EltwiseLayer::Not_equal ||
-                                            op == EltwiseLayer::Greater || op == EltwiseLayer::Greater_equal ||
-                                            op == EltwiseLayer::Less || op == EltwiseLayer::Less_equal);
-
-            if (po == Precision::FP32 && pi0 == po && pi1 == po) {
-                ref_eltwise<float, float>(0, 1);
-            } else if (po == Precision::FP32 && pi0 == po && pi1 == Precision::I8) {
-                ref_eltwise<float, int8_t>(0, 1);
-            } else if (po == Precision::FP32 && pi1 == po && pi0 == Precision::I8) {
-                ref_eltwise<float, int8_t>(1, 0);
-            } else if (po == Precision::FP32 && pi0 == po && pi1 == Precision::U8) {
-                ref_eltwise<float, uint8_t>(0, 1);
-            } else if (po == Precision::FP32 && pi1 == po && pi0 == Precision::U8) {
-                ref_eltwise<float, uint8_t>(1, 0);
-            } else if (po == Precision::I8 && pi0 == po && pi1 == po) {
-                ref_eltwise<int8_t, int8_t>(0, 1);
-            } else if (po == Precision::I8 && pi0 == po && pi1 == Precision::U8) {
-                ref_eltwise<int8_t, uint8_t>(0, 1);
-            } else if (po == Precision::I8 && pi1 == po && pi0 == Precision::U8) {
-                ref_eltwise<int8_t, uint8_t>(1, 0);
-            } else if (po == Precision::I32 && pi0 == po && pi1 == po) {
-                ref_eltwise<int32_t, int32_t>(0, 1);
-            } else if (po == Precision::U8 && pi0 == Precision::I32 && pi0 == pi1 && is_eltwise_compare_node) {
-                ref_eltwise2<int32_t, int32_t, uint8_t>(0, 1);
-            } else if (po == Precision::U8 && pi0 == Precision::FP32 && pi0 == pi1 && is_eltwise_compare_node) {
-                ref_eltwise2<float, float, uint8_t>(0, 1);
-            } else {
-                THROW_IE_EXCEPTION << "Eltwise node with unsupported combination of input and output types";
+    if (node->getType() == Eltwise) {
+        auto eltwiseNode = dynamic_cast<MKLDNNEltwiseNode*>(node.get());
+        if (eltwiseNode->getParentEdgesAtPort(0)[0]->getParent().get() != this) {
+            // Eltwise jitter doesn't respect the commutative property, so fusing is disabled when it is applied to a port other than the 0-th.
+            if (isOneOf(eltwiseNode->getOpType(), {Subtract, Divide, FloorMod, Mod, PowerDynamic, Greater, GreaterEqual, Less, LessEqual})) {
+                return false;
             }
-        }
-    }
-}
-
-bool MKLDNNEltwiseNode::created() const {
-    return getType() == Eltwise;
-}
 
-bool MKLDNNEltwiseNode::canBeInPlace() const {
-    size_t inPlaceWithParent = getParentEdges().size();
-    for (size_t i = 0; i < inPlaceWithParent; i++) {
-        auto parentEdge = getParentEdgeAt(i);
-        if (!parentEdge->getParent()->isConstant() &&
-                parentEdge->getParent()->getChildEdges().size() == 1) {
-            inPlaceWithParent = i;
-            break;
-        }
-    }
-    // This is WA for MKLDNN implementation
-    if (inPlaceWithParent != 0)
-        return false;
-    MKLDNNDims dims = getParentEdgeAt(0)->getDims();
-    for (size_t cIdx = 0; cIdx < getChildEdges().size(); cIdx++) {
-        if (getChildEdgeAt(cIdx)->getDims() != dims) {
-            return false;
+            // Limitation: the input precision definition inside the Eltwise node assumes fusing is applied to the 0-th port,
+            // otherwise we need identical precision on all inputs of the fused node
+            for (int i = 1; i < eltwiseNode->getCnnLayer()->insData.size(); i++) {
+                if (eltwiseNode->getCnnLayer()->insData[0].lock()->getPrecision() != eltwiseNode->getCnnLayer()->insData[i].lock()->getPrecision()) {
+                    return false;
+                }
+            }
         }
+
+        return true;
     }
 
-    // Broadcast mode is complex for inplace usage
-    // So will disable it
-    if (broadcast) return false;
+    if (node->getType() == Quantize) {
+        auto *quantizeNode = dynamic_cast<MKLDNNQuantizeNode *>(node.get());
+        if (quantizeNode == nullptr)
+            THROW_IE_EXCEPTION << "Cannot get quantize layer " << node->getName();
+        return !quantizeNode->isBinarization();
+    }
 
-    return true;
+    return false;
 }
+
 REG_MKLDNN_PRIM_FOR(MKLDNNEltwiseNode, Eltwise);
index 15b13c1..9b003ca 100644 (file)
@@ -8,45 +8,98 @@
 #include <mkldnn_node.h>
 #include <string>
 #include <vector>
-#include <c_types_map.hpp>
 #include <memory>
+#include <caseless.hpp>
 
 namespace MKLDNNPlugin {
 
-struct jit_eltwise_fq_params {
-    int src0_step;
-    int src1_step;
-    int dst_step;
-    mkldnn::memory::data_type src0_dt;
-    mkldnn::memory::data_type src1_dt;
-    mkldnn::memory::data_type dst_dt;
-    int src0_data_size;
-    int src1_data_size;
-    int dst_data_size;
-
-    InferenceEngine::EltwiseLayer::eOperation eltwise_op;
+#define MAX_ELTWISE_INPUTS 7
+
+enum EltwiseOpType {
+    Add = 0,
+    Multiply,
+    Subtract,
+    Divide,
+    FloorMod,
+    Mod,
+    Maximum,
+    Minimum,
+    SquaredDifference,
+    PowerDynamic,
+    PowerStatic,
+    MulAdd,
+
+    Equal,
+    NotEqual,
+    Greater,
+    GreaterEqual,
+    Less,
+    LessEqual,
+
+    LogicalAnd,
+    LogicalOr,
+    LogicalXor,
+    LogicalNot,
+
+    Relu,
+    Gelu,
+    Elu,
+    Tanh,
+    Logistic,
+    Square,
+    Abs,
+    Sqrt,
+    Linear,
+    BoundedRelu,
+    SoftRelu,
+    Relu6,
+    Exp,
+    Clamp,
+    Swish,
+    Prelu,
+    Mish,
+    Hswish,
+    Hsigmoid
 };
 
-struct jit_eltwise_fq_call_args {
-    const void *src0;
-    const void *src1;
+struct jit_eltwise_params {
+    size_t inputs_number;
+    size_t input_size;
+
+    InferenceEngine::Precision src_prc[MAX_ELTWISE_INPUTS];
+    InferenceEngine::Precision dst_prc;
+
+    std::vector<size_t> src_offsets[MAX_ELTWISE_INPUTS];
+    std::vector<size_t> dst_offsets;
+
+    size_t src_size[MAX_ELTWISE_INPUTS];
+    size_t dst_size;
+    size_t oc_size;
+};
+
+struct jit_eltwise_call_args {
+    const void *src_ptr[MAX_ELTWISE_INPUTS];
     void *dst;
+
     size_t work_amount;
+    size_t oc_off;
 };
 
-struct jit_uni_eltwise_fq_kernel {
-    void (*ker_)(const jit_eltwise_fq_call_args *);
+class MKLDNNEltwiseNode;
 
-    void operator()(const jit_eltwise_fq_call_args *args) {
+struct jit_uni_eltwise_kernel {
+    void (*ker_)(const jit_eltwise_call_args *);
+
+    void operator()(const jit_eltwise_call_args *args) {
         assert(ker_);
         ker_(args);
     }
 
-    explicit jit_uni_eltwise_fq_kernel(jit_eltwise_fq_params jep, const mkldnn_primitive_attr &attr) : ker_(nullptr), jep_(jep), attr_(attr) {}
-    virtual ~jit_uni_eltwise_fq_kernel() {}
+    explicit jit_uni_eltwise_kernel(jit_eltwise_params jep, MKLDNNEltwiseNode& node) : ker_(nullptr), jep_(jep), eltwiseNode(node) {}
+    virtual ~jit_uni_eltwise_kernel() {}
 
-    jit_eltwise_fq_params jep_;
-    const mkldnn_primitive_attr &attr_;
+    jit_eltwise_params jep_;
+    MKLDNNEltwiseNode& eltwiseNode;
 };
 
 class MKLDNNEltwiseNode : public MKLDNNNode {
@@ -56,54 +109,66 @@ public:
 
     void getSupportedDescriptors() override;
     void initSupportedPrimitiveDescriptors() override;
+    void selectOptimalPrimitiveDescriptor() override;
     void createPrimitive() override;
     void execute(mkldnn::stream strm) override;
     bool created() const override;
     bool canBeInPlace() const override;
 
     bool isSum();
-    bool isUnitScales();
     bool isWithBroadcast();
-    void initOptimalPrimitiveDescriptor() override;
+
+    bool canFuse(const MKLDNNNodePtr& node) const;
+
+    size_t getOpInputsNum() const;
+    EltwiseOpType getOpType() const { return eltwiseOp; }
+    mkldnn::algorithm getAlgorithm() const { return eltwiseAlgorithm; }
+
+    float getAlpha() const { return alpha; }
+    float getBeta() const { return beta; }
+
+    void appendPostOps(mkldnn::post_ops& ops) override;
 
 private:
-    InferenceEngine::EltwiseLayer::eOperation op;
-    std::vector<float> sum_scales;
-    bool broadcast = false;
-    int batch_dim = 5;
-    mkldnn::primitive_attr attr;
-
-    std::shared_ptr<jit_uni_eltwise_fq_kernel> eltiwse_fq_kernel;
-    jit_eltwise_fq_params jep;
-
-    void jit_eltwise_fq();
-    void setPostOps(mkldnn::primitive_attr &attr, bool initWeights);
-
-    template <typename T0, typename T1> void ref_eltwise(int in0, int in1);
-    template <typename T0, typename T1, typename T2> void ref_eltwise2(int in0, int in1);
-    void dims_calc(int *dims, const MKLDNNDims &edge_dims, bool channels_first);
-    void offset_out_calc(int *offset, int *dims);
-    void offset_in_calc(int *offset, int *dims_in, int *dims_out);
-
-    template <typename T0, typename T1> void eltwise_add(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1> void eltwise_prod(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1> void eltwise_max(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1> void eltwise_sub(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1> void eltwise_min(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1> void eltwise_div(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1> void eltwise_squared_diff(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1> void eltwise_floor_mod(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1> void eltwise_pow(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1> void eltwise_logical_and(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1> void eltwise_logical_or(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1> void eltwise_logical_xor(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
-
-    template <typename T0, typename T1, typename T2> void eltwise_equal(const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1, typename T2> void eltwise_not_equal(const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1, typename T2> void eltwise_less(const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1, typename T2> void eltwise_less_equal(const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1, typename T2> void eltwise_greater(const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1, typename T2> void eltwise_greater_equal(const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, size_t dst_data_size);
+    void init() override;
+
+    EltwiseOpType eltwiseOp = Add;
+    mkldnn::algorithm eltwiseAlgorithm = mkldnn::algorithm_undef;
+
+    std::shared_ptr<jit_uni_eltwise_kernel> eltwise_kernel = nullptr;
+    jit_eltwise_params jep = {};
+
+    int optimalTensorRank = 6;
+    bool canUseOptimizedImpl = false;
+    bool isDynBatchEnabled = false;
+    size_t batchDimIdx = 0;
+    size_t tensorRank = 0;
+    size_t fullWorkAmount = 0;
+    size_t schedulerWorkAmount = 0;
+    std::vector<std::vector<size_t>> dims_in = {};
+    std::vector<std::vector<size_t>> offsets_in = {};
+    std::vector<size_t> dims_out = {};
+    std::vector<size_t> offsets_out = {};
+    std::vector<ptrdiff_t> start_offset_in = {};
+    ptrdiff_t start_offset_out = 0;
+    std::vector<size_t> offsets_oc = {};
+
+    float alpha = 0;
+    float beta = 0;
+    float gamma = 0;
+
+    std::vector<float> scales = {};
+    std::vector<float> shifts = {};
+
+    inline void executeOptimized6D(const std::vector<const uint8_t *>& src_ptrs, uint8_t *dst_ptr);
+    inline void executeOptimizedGeneric(const std::vector<const uint8_t *>& src_ptrs, uint8_t *dst_ptr);
+    inline void executeReference(const std::vector<const uint8_t *>& src_ptrs, uint8_t *dst_ptr);
+
+    void offset_out_calc(std::vector<size_t>& offset, std::vector<size_t>& dims);
+    void offset_in_calc(std::vector<size_t>& offset, std::vector<size_t>& dims_in, std::vector<size_t>& dims_out);
+
+    static InferenceEngine::details::caseless_map<std::string,
+        std::function<void(InferenceEngine::GenericLayer*, EltwiseOpType&, mkldnn::algorithm&, float&, float&)>> initializers;
 };
 
 }  // namespace MKLDNNPlugin
index 8d0b13d..bcb97ca 100644 (file)
@@ -3,8 +3,7 @@
 //
 
 #include "mkldnn_fullyconnected_node.h"
-#include "mkldnn_activation_node.h"
-#include "mkldnn_depthwise_node.h"
+#include "mkldnn_eltwise_node.h"
 #include "mkldnn_quantize_node.h"
 #include "desc_iterator.hpp"
 #include <legacy/ie_layers.h>
@@ -199,10 +198,10 @@ void MKLDNNFullyConnectedNode::setPostOps(mkldnn::primitive_attr &attr, bool ini
             continue;
         }
 
-        auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode *>(node.get());
-        if (depthwiseNode) {
+        auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
+        if (eltwiseNode && (eltwiseNode->getOpType() == MulAdd || eltwiseNode->getOpType() == Prelu)) {
             if (initWeights) {
-                auto* depthwiseLayer = reinterpret_cast<WeightableLayer*>(depthwiseNode->getCnnLayer().get());
+                auto* depthwiseLayer = reinterpret_cast<WeightableLayer*>(eltwiseNode->getCnnLayer().get());
                 int ndims = getParentEdgeAt(0)->getDims().ndims();
                 MKLDNNDims depthwiseDims({static_cast<ptrdiff_t>(rnd_up(ndims == 3 ? getChildEdgeAt(0)->getDims()[2] : getChildEdgeAt(0)->getDims()[1], 16))});
 
@@ -211,7 +210,7 @@ void MKLDNNFullyConnectedNode::setPostOps(mkldnn::primitive_attr &attr, bool ini
                 PostOpsIntBlobMemory[blob_idx]->FillZero();
 
                 // In case ndims == 3 graph optimizer allows fusing only if all weights values are the same
-                if (depthwiseNode->isBroadcast() || ndims == 3) {
+                if (depthwiseLayer->blobs["weights"]->size() == 1 || ndims == 3) {
                     float broadcastValue = static_cast<float *>(depthwiseLayer->_weights->buffer())[0];
                     for (int i = 0; i < PostOpsIntBlobMemory[blob_idx]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
                         static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[i] = broadcastValue;
@@ -223,13 +222,13 @@ void MKLDNNFullyConnectedNode::setPostOps(mkldnn::primitive_attr &attr, bool ini
                                                             MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
                 }
 
-                if (depthwiseNode->getAlgorithm() == depthwise_scale_shift) {
+                if (eltwiseNode->getAlgorithm() == depthwise_scale_shift) {
                     PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
                     PostOpsIntBlobMemory[blob_idx + 1]->Create(depthwiseDims, memory::data_type::f32, memory::format::x);
                     PostOpsIntBlobMemory[blob_idx + 1]->FillZero();
 
                     // In case ndims == 3 graph optimizer allows fusing only if all biases values are the same
-                    if (depthwiseNode->isBroadcast() || ndims == 3) {
+                    if (depthwiseLayer->blobs["biases"]->size() == 1 || ndims == 3) {
                         float broadcastValue = static_cast<float *>(depthwiseLayer->_biases->buffer())[0];
                         for (int i = 0; i < PostOpsIntBlobMemory[blob_idx + 1]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
                             static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[i] = broadcastValue;
@@ -241,20 +240,20 @@ void MKLDNNFullyConnectedNode::setPostOps(mkldnn::primitive_attr &attr, bool ini
                                                                     MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
                     }
 
-                    ops.append_depthwise(depthwiseNode->getAlgorithm(),
+                    ops.append_depthwise(eltwiseNode->getAlgorithm(),
                                          (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(),
                                          (const float *) PostOpsIntBlobMemory[blob_idx + 1]->GetData());
 
                     blob_idx += 2;
                 } else {
-                    ops.append_depthwise(depthwiseNode->getAlgorithm(),
+                    ops.append_depthwise(eltwiseNode->getAlgorithm(),
                                          (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(),
                                          nullptr);
 
                     blob_idx += 1;
                 }
             } else {
-                ops.append_depthwise(depthwiseNode->getAlgorithm(),
+                ops.append_depthwise(eltwiseNode->getAlgorithm(),
                                      nullptr,
                                      nullptr);
             }
@@ -262,11 +261,8 @@ void MKLDNNFullyConnectedNode::setPostOps(mkldnn::primitive_attr &attr, bool ini
             continue;
         }
 
-        auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get());
-        if (activationNode) {
-            ops.append_eltwise(1.0, activationNode->getAlgorithm(), activationNode->getAlpha(), activationNode->getBeta());
-
-            continue;
+        if (eltwiseNode) {
+            eltwiseNode->appendPostOps(ops);
         }
     }
 
index 5518956..de76d7d 100644 (file)
@@ -5,9 +5,8 @@
 #include "mkldnn_interpolate_node.h"
 #include "desc_iterator.hpp"
 #include "mkldnn_quantize_node.h"
-#include "mkldnn_depthwise_node.h"
-#include "mkldnn_activation_node.h"
 #include <legacy/ie_layers.h>
+#include "mkldnn_eltwise_node.h"
 #include <mkldnn.hpp>
 #include <string>
 #include <vector>
@@ -1480,62 +1479,9 @@ void MKLDNNInterpolateNode::setPostOps(mkldnn::primitive_attr &attr, bool initWe
             continue;
         }
 
-        auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode *>(node.get());
-        if (depthwiseNode) {
-            if (initWeights) {
-                auto* depthwiseLayer = reinterpret_cast<WeightableLayer*>(depthwiseNode->getCnnLayer().get());
-                MKLDNNDims depthwiseDims({static_cast<ptrdiff_t>(rnd_up(getChildEdgeAt(0)->getDims()[1], 16))});
-
-                PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
-                PostOpsIntBlobMemory[blob_idx]->Create(depthwiseDims, memory::data_type::f32, memory::format::x);
-
-                PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::x,
-                                                        depthwiseLayer->_weights->buffer(),
-                                                        depthwiseLayer->_weights->size() *
-                                                        MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
-
-                if (depthwiseNode->isBroadcast()) {
-                    float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[0];
-                    for (int i = 1; i < PostOpsIntBlobMemory[blob_idx]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
-                        static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[i] = broadcastValue;
-                    }
-                }
-
-                if (depthwiseNode->getAlgorithm() == depthwise_scale_shift) {
-                    PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
-                    PostOpsIntBlobMemory[blob_idx + 1]->Create(depthwiseDims, memory::data_type::f32,
-                                                               memory::format::x);
-                    PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::x,
-                                                                depthwiseLayer->_biases->buffer(),
-                                                                depthwiseLayer->_biases->size() *
-                                                                MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
-
-                    if (depthwiseNode->isBroadcast()) {
-                        float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[0];
-                        for (int i = 1; i < PostOpsIntBlobMemory[blob_idx + 1]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
-                            static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[i] = broadcastValue;
-                        }
-                    }
-
-                    ops.append_depthwise(depthwiseNode->getAlgorithm(),
-                                         (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(),
-                                         (const float *) PostOpsIntBlobMemory[blob_idx + 1]->GetData());
-
-                    blob_idx += 2;
-                }
-            } else {
-                ops.append_depthwise(depthwiseNode->getAlgorithm(),
-                                     nullptr,
-                                     nullptr);
-            }
-
-            continue;
-        }
-
-        auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get());
-        if (activationNode) {
-            ops.append_eltwise(1.0, activationNode->getAlgorithm(), activationNode->getAlpha(), activationNode->getBeta());
-
+        auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
+        if (eltwiseNode) {
+            eltwiseNode->appendPostOps(ops);
             continue;
         }
 
@@ -2153,7 +2099,7 @@ inline int MKLDNNInterpolateNode::nearestRound(float originCoord, bool isDownsam
 }
 
 bool MKLDNNInterpolateNode::canFuse(const MKLDNNNodePtr& node) const {
-    auto isOneOf = [](mkldnn::algorithm alg, std::vector<mkldnn::algorithm> algs) {
+    auto isOneOf = [&](EltwiseOpType alg, std::vector<EltwiseOpType> algs) {
         for (auto a : algs) {
             if (alg == a) {
                 return true;
@@ -2170,22 +2116,16 @@ bool MKLDNNInterpolateNode::canFuse(const MKLDNNNodePtr& node) const {
     if (node->getType() == Quantize) {
         auto* quantizeNode = dynamic_cast<MKLDNNQuantizeNode*>(node.get());
         if (quantizeNode == nullptr)
-            THROW_IE_EXCEPTION << "Cannot get quantize layer " << node->getName();
+            THROW_IE_EXCEPTION << "Cannot get quantize node " << node->getName();
         return !quantizeNode->isBinarization();
-    } else if (node->getType() == Depthwise) {
-        auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode*>(node.get());
-        if (depthwiseNode == nullptr)
-            THROW_IE_EXCEPTION << "Cannot get depthwise layer " << node->getName();
-        return ((depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_scale_shift && depthwiseNode->isWithBiases()) ||
-                (depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_prelu));
-    } else if (node->getType() == Activation) {
-        auto* activationNode = dynamic_cast<MKLDNNActivationNode*>(node.get());
-        if (activationNode == nullptr)
-            THROW_IE_EXCEPTION << "Cannot get activation layer " << node->getName();
-        return isOneOf(activationNode->getAlgorithm(), {eltwise_relu, eltwise_gelu, eltwise_elu, eltwise_logistic,
-            eltwise_bounded_relu, eltwise_clamp, eltwise_tanh, eltwise_swish, eltwise_hswish, eltwise_mish, eltwise_hsigmoid,
-            eltwise_linear, eltwise_abs, eltwise_square, eltwise_sqrt});
+    } else if (node->getType() == Eltwise) {
+        auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode*>(node.get());
+        if (eltwiseNode == nullptr)
+            THROW_IE_EXCEPTION << "Cannot get eltwise node " << node->getName();
+        return isOneOf(eltwiseNode->getOpType(), {MulAdd, Prelu, Relu, Gelu, Elu, Logistic, BoundedRelu, Clamp,
+                                                  Tanh, Swish, Hswish, Mish, Hsigmoid, Linear, Abs, Square, Sqrt});
     }
+
     return false;
 }
 
index a5199a8..625a5b2 100644 (file)
@@ -5,9 +5,8 @@
 #include "mkldnn_mvn_node.h"
 #include "desc_iterator.hpp"
 #include "mkldnn_quantize_node.h"
-#include "mkldnn_depthwise_node.h"
-#include "mkldnn_activation_node.h"
 #include <legacy/ie_layers.h>
+#include "mkldnn_eltwise_node.h"
 #include <mkldnn.hpp>
 #include <string>
 #include <vector>
@@ -597,64 +596,9 @@ void MKLDNNMVNNode::setPostOps(mkldnn::primitive_attr &attr, bool initWeights) {
             continue;
         }
 
-        auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode *>(node.get());
-        if (depthwiseNode) {
-            if (initWeights) {
-                auto* depthwiseLayer = reinterpret_cast<WeightableLayer*>(depthwiseNode->getCnnLayer().get());
-                MKLDNNDims depthwiseDims({static_cast<ptrdiff_t>(rnd_up(getChildEdgeAt(0)->getDims()[1], 16))});
-
-                PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
-                PostOpsIntBlobMemory[blob_idx]->Create(depthwiseDims, memory::data_type::f32, memory::format::x);
-                PostOpsIntBlobMemory[blob_idx]->FillZero();
-
-                PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::x,
-                                                        depthwiseLayer->_weights->buffer(),
-                                                        depthwiseLayer->_weights->size() *
-                                                        MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
-
-                if (depthwiseNode->isBroadcast()) {
-                    float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[0];
-                    for (int i = 1; i < PostOpsIntBlobMemory[blob_idx]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
-                        static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[i] = broadcastValue;
-                    }
-                }
-
-                if (depthwiseNode->getAlgorithm() == depthwise_scale_shift) {
-                    PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
-                    PostOpsIntBlobMemory[blob_idx + 1]->Create(depthwiseDims, memory::data_type::f32,
-                                                               memory::format::x);
-                    PostOpsIntBlobMemory[blob_idx + 1]->FillZero();
-                    PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::x,
-                                                                depthwiseLayer->_biases->buffer(),
-                                                                depthwiseLayer->_biases->size() *
-                                                                MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
-
-                    if (depthwiseNode->isBroadcast()) {
-                        float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[0];
-                        for (int i = 1; i < PostOpsIntBlobMemory[blob_idx + 1]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
-                            static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[i] = broadcastValue;
-                        }
-                    }
-
-                    ops.append_depthwise(depthwiseNode->getAlgorithm(),
-                                         (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(),
-                                         (const float *) PostOpsIntBlobMemory[blob_idx + 1]->GetData());
-
-                    blob_idx += 2;
-                }
-            } else {
-                ops.append_depthwise(depthwiseNode->getAlgorithm(),
-                                     nullptr,
-                                     nullptr);
-            }
-
-            continue;
-        }
-
-        auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get());
-        if (activationNode) {
-            ops.append_eltwise(1.0, activationNode->getAlgorithm(), activationNode->getAlpha(), activationNode->getBeta());
-
+        auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
+        if (eltwiseNode) {
+            eltwiseNode->appendPostOps(ops);
             continue;
         }
 
index fd59bb9..72f7570 100644 (file)
@@ -3,8 +3,7 @@
 //
 
 #include "mkldnn_quantize_node.h"
-#include "mkldnn_depthwise_node.h"
-#include "mkldnn_activation_node.h"
+#include "mkldnn_eltwise_node.h"
 #include <mkldnn_extension_utils.h>
 #include <legacy/ie_layers_internal.hpp>
 #include "ie_parallel.hpp"
@@ -808,70 +807,9 @@ void MKLDNNNormalizeNode::setPostOps(mkldnn::primitive_attr &attr, bool initWeig
             continue;
         }
 
-        auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode *>(node.get());
-        if (depthwiseNode) {
-            if (initWeights) {
-                auto* depthwiseLayer = reinterpret_cast<WeightableLayer*>(depthwiseNode->getCnnLayer().get());
-                MKLDNNDims depthwiseDims({static_cast<ptrdiff_t>(rnd_up(getParentEdgeAt(0)->getDims()[1], 16))});
-
-                PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
-                PostOpsIntBlobMemory[blob_idx]->Create(depthwiseDims, memory::data_type::f32, memory::format::x);
-                PostOpsIntBlobMemory[blob_idx]->FillZero();
-
-                PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::x,
-                                                        depthwiseLayer->_weights->buffer(),
-                                                        depthwiseLayer->_weights->size() *
-                                                        MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
-
-                if (depthwiseNode->isBroadcast()) {
-                    float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[0];
-                    for (int i = 1; i < PostOpsIntBlobMemory[blob_idx]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
-                        static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[i] = broadcastValue;
-                    }
-                }
-
-                if (depthwiseNode->getAlgorithm() == depthwise_scale_shift) {
-                    PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
-                    PostOpsIntBlobMemory[blob_idx + 1]->Create(depthwiseDims, memory::data_type::f32,
-                                                               memory::format::x);
-                    PostOpsIntBlobMemory[blob_idx + 1]->FillZero();
-                    PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::x,
-                                                                depthwiseLayer->_biases->buffer(),
-                                                                depthwiseLayer->_biases->size() *
-                                                                MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
-
-                    if (depthwiseNode->isBroadcast()) {
-                        float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[0];
-                        for (int i = 1; i < PostOpsIntBlobMemory[blob_idx + 1]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
-                            static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[i] = broadcastValue;
-                        }
-                    }
-
-                    ops.append_depthwise(depthwiseNode->getAlgorithm(),
-                                         (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(),
-                                         (const float *) PostOpsIntBlobMemory[blob_idx + 1]->GetData());
-
-                    blob_idx += 2;
-                } else {
-                    ops.append_depthwise(depthwiseNode->getAlgorithm(),
-                                         (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(),
-                                         nullptr);
-
-                    blob_idx += 1;
-                }
-            } else {
-                ops.append_depthwise(depthwiseNode->getAlgorithm(),
-                                     nullptr,
-                                     nullptr);
-            }
-
-            continue;
-        }
-
-        auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get());
-        if (activationNode) {
-            ops.append_eltwise(1.0, activationNode->getAlgorithm(), activationNode->getAlpha(), activationNode->getBeta());
-
+        auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
+        if (eltwiseNode) {
+            eltwiseNode->appendPostOps(ops);
             continue;
         }
 
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.cpp
deleted file mode 100644 (file)
index c2885b6..0000000
+++ /dev/null
@@ -1,133 +0,0 @@
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#include "mkldnn_power_node.h"
-#include <legacy/ie_layers.h>
-#include <string>
-#include <cmath>
-#include <mkldnn_types.h>
-#include <mkldnn_extension_utils.h>
-#include <limits>
-#include "ie_parallel.hpp"
-
-using namespace mkldnn;
-using namespace MKLDNNPlugin;
-using namespace InferenceEngine;
-
-MKLDNNPowerNode::MKLDNNPowerNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache)
-        : MKLDNNNode(layer, eng, cache), scale(1.0f), shift(1.0f), power(1.0f) {}
-
-void MKLDNNPowerNode::getSupportedDescriptors() {
-    auto * powerLayer = dynamic_cast<PowerLayer*>(getCnnLayer().get());
-
-    if (powerLayer == nullptr)
-        THROW_IE_EXCEPTION << "Cannot convert power layer.";
-    scale = powerLayer->scale;
-    power = powerLayer->power;
-    shift = powerLayer->offset;
-
-    if (getParentEdges().size() != 1)
-        THROW_IE_EXCEPTION << "Incorrect number of input edges for layer " << getName();
-    if (getChildEdges().empty())
-        THROW_IE_EXCEPTION << "Incorrect number of output edges for layer " << getName();
-}
-
-void MKLDNNPowerNode::initSupportedPrimitiveDescriptors() {
-    if (!supportedPrimitiveDescriptors.empty())
-        return;
-
-    InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision();
-    if (precision != InferenceEngine::Precision::FP32)
-        precision = InferenceEngine::Precision::FP32;
-    auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
-    precision = getCnnLayer()->outData[0]->getPrecision();
-    if (precision != InferenceEngine::Precision::FP32)
-        precision = InferenceEngine::Precision::FP32;
-    auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
-
-    InferenceEngine::LayerConfig config;
-    config.dynBatchSupport = true;
-    config.inConfs.resize(1);
-    config.outConfs.resize(1);
-    config.inConfs[0].inPlace = -1;
-    config.inConfs[0].constant = false;
-    config.outConfs[0].inPlace = -1;
-    config.outConfs[0].constant = false;
-    for (auto format : getAvailableFormatsForDims(getParentEdgeAt(0)->getDims())) {
-        config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, format);
-        config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, format);
-        if (format != memory::any) {
-            config.inConfs[0].desc = InferenceEngine::TensorDesc(config.inConfs[0].desc.getPrecision(),
-                                                                 config.inConfs[0].desc.getDims(), {
-                                                                         config.inConfs[0].desc.getBlockingDesc().getBlockDims(),
-                                                                         config.inConfs[0].desc.getBlockingDesc().getOrder(),
-                                                                         (std::numeric_limits<size_t>::max)()
-                                                                 });
-            config.outConfs[0].desc = InferenceEngine::TensorDesc(config.outConfs[0].desc.getPrecision(),
-                                                                  config.outConfs[0].desc.getDims(), {
-                                                                          config.outConfs[0].desc.getBlockingDesc().getBlockDims(),
-                                                                          config.outConfs[0].desc.getBlockingDesc().getOrder(),
-                                                                          (std::numeric_limits<size_t>::max)()
-                                                                  });
-        }
-        supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, format);
-    }
-}
-
-void MKLDNNPowerNode::createPrimitive() {
-    auto& dstMemPtr = getChildEdgeAt(0)->getMemoryPtr();
-    auto& srcMemPtr = getParentEdgeAt(0)->getMemoryPtr();
-    if (!dstMemPtr || !dstMemPtr->GetPrimitivePtr())
-        THROW_IE_EXCEPTION << "Destination memory didn't allocate.";
-    if (!srcMemPtr || !srcMemPtr->GetPrimitivePtr())
-        THROW_IE_EXCEPTION << "Input memory didn't allocate.";
-    if (getSelectedPrimitiveDescriptor() == nullptr)
-        THROW_IE_EXCEPTION << "Preferable primitive descriptor is not set.";
-}
-
-void MKLDNNPowerNode::execute(mkldnn::stream strm) {
-    auto& srcMemory = getParentEdgeAt(0)->getMemory();
-    auto& dstMemory = getChildEdgeAt(0)->getMemory();
-    const size_t data_size = srcMemory.GetSize() / sizeof(float) / srcMemory.GetDims()[0] * batchToProcess();
-
-    const auto *src_ptr = reinterpret_cast<const float*>(srcMemory.GetData()) +
-            srcMemory.GetDescriptor().data.layout_desc.blocking.offset_padding;
-    float *dst_ptr = reinterpret_cast<float*>(dstMemory.GetData()) +
-            dstMemory.GetDescriptor().data.layout_desc.blocking.offset_padding;
-
-    if (power == -1.f) {
-        parallel_for(data_size, [&](size_t i) {
-            float val = src_ptr[i] * scale + shift;
-            dst_ptr[i] = 1 / val;
-        });
-    } else if (power == 0.5f) {
-        parallel_for(data_size, [&](size_t i) {
-            float val = src_ptr[i] * scale + shift;
-            dst_ptr[i] = sqrtf(val);
-        });
-    } else if (power == 1.0f) {
-        parallel_for(data_size, [&](size_t i) {
-            dst_ptr[i] = src_ptr[i] * scale + shift;
-        });
-    } else if (power == 2.0f) {
-        parallel_for(data_size, [&](size_t i) {
-            float val = src_ptr[i] * scale + shift;
-            dst_ptr[i] = val * val;
-        });
-    } else if (power == 3.0f) {
-        parallel_for(data_size, [&](size_t i) {
-            float val = src_ptr[i] * scale + shift;
-            dst_ptr[i] = val * val * val;
-        });
-    } else {
-        parallel_for(data_size, [&](size_t i) {
-            dst_ptr[i] = pow(src_ptr[i] * scale + shift, power);
-        });
-    }
-}
-
-bool MKLDNNPowerNode::created() const {
-    return getType() == Power;
-}
-REG_MKLDNN_PRIM_FOR(MKLDNNPowerNode, Power);
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.h
deleted file mode 100644 (file)
index 71103b9..0000000
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma once
-
-#include <ie_common.h>
-#include <mkldnn_node.h>
-#include <string>
-
-namespace MKLDNNPlugin {
-
-class MKLDNNPowerNode : public MKLDNNNode {
-public:
-    MKLDNNPowerNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
-    ~MKLDNNPowerNode() override = default;
-
-    void getSupportedDescriptors() override;
-    void initSupportedPrimitiveDescriptors() override;
-    void createPrimitive() override;
-    void execute(mkldnn::stream strm) override;
-    bool created() const override;
-
-private:
-    float scale;
-    float shift;
-    float power;
-};
-
-}  // namespace MKLDNNPlugin
-
index c313107..5331dc2 100644 (file)
@@ -43,10 +43,6 @@ void MKLDNNQuantizeNode::init() {
             THROW_IE_EXCEPTION << "Quantize layer " << getName() << " has unsupported number of parent edges at port " << i;
     }
 
-    if (getParentEdgesAtPort(0)[0]->getDims().ndims() < 1ul || getParentEdgesAtPort(0)[0]->getDims().ndims() > 5ul) {
-        THROW_IE_EXCEPTION << "Unsupported number of dimensions for input at edge 0 in Quantize layer " << getName();
-    }
-
     auto initAxisIdx = [&](size_t edgeIdx) {
         auto edge = getParentEdgesAtPort(edgeIdx)[0];
 
@@ -319,6 +315,10 @@ std::vector<mkldnn::memory::format> MKLDNNQuantizeNode::getDataFormats() const {
 }
 
 void MKLDNNQuantizeNode::getSupportedDescriptors() {
+    if (getParentEdgesAtPort(0)[0]->getDims().ndims() < 1ul || getParentEdgesAtPort(0)[0]->getDims().ndims() > 5ul) {
+        THROW_IE_EXCEPTION << "Unsupported number of dimensions for input at edge 0 in Quantize layer " << getName();
+    }
+
     mkldnn::memory::data_type idt = MKLDNNExtensionUtils::IEPrecisionToDataType(getInputPrecision());
     mkldnn::memory::data_type wdt = MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::Precision::FP32);
     mkldnn::memory::data_type ddt = MKLDNNExtensionUtils::IEPrecisionToDataType(getOutputPrecision());
index c2f941a..afe532d 100644 (file)
@@ -5,12 +5,11 @@
 #include "mkldnn_reduce_node.h"
 #include "desc_iterator.hpp"
 #include "mkldnn_quantize_node.h"
-#include "mkldnn_depthwise_node.h"
-#include "mkldnn_activation_node.h"
 #include <legacy/ie_layers.h>
 #include <mkldnn.hpp>
 #include <string>
 #include <vector>
+#include <set>
 #include <mkldnn_types.h>
 #include <mkldnn_extension_utils.h>
 #include "ie_parallel.hpp"
index b63fc70..035b452 100644 (file)
@@ -5,9 +5,8 @@
 #include "mkldnn_resample_node.h"
 #include "desc_iterator.hpp"
 #include "mkldnn_quantize_node.h"
-#include "mkldnn_depthwise_node.h"
-#include "mkldnn_activation_node.h"
 #include <legacy/ie_layers.h>
+#include "mkldnn_eltwise_node.h"
 #include <mkldnn.hpp>
 #include <string>
 #include <vector>
@@ -438,64 +437,9 @@ void MKLDNNResampleNode::setPostOps(mkldnn::primitive_attr &attr, bool initWeigh
             continue;
         }
 
-        auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode *>(node.get());
-        if (depthwiseNode) {
-            if (initWeights) {
-                auto* depthwiseLayer = reinterpret_cast<WeightableLayer*>(depthwiseNode->getCnnLayer().get());
-                MKLDNNDims depthwiseDims({static_cast<ptrdiff_t>(rnd_up(getChildEdgeAt(0)->getDims()[1], 16))});
-
-                PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
-                PostOpsIntBlobMemory[blob_idx]->Create(depthwiseDims, memory::data_type::f32, memory::format::x);
-                PostOpsIntBlobMemory[blob_idx]->FillZero();
-
-                PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::x,
-                                                        depthwiseLayer->_weights->buffer(),
-                                                        depthwiseLayer->_weights->size() *
-                                                        MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
-
-                if (depthwiseNode->isBroadcast()) {
-                    float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[0];
-                    for (int i = 1; i < PostOpsIntBlobMemory[blob_idx]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
-                        static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[i] = broadcastValue;
-                    }
-                }
-
-                if (depthwiseNode->getAlgorithm() == depthwise_scale_shift) {
-                    PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
-                    PostOpsIntBlobMemory[blob_idx + 1]->Create(depthwiseDims, memory::data_type::f32,
-                                                               memory::format::x);
-                    PostOpsIntBlobMemory[blob_idx + 1]->FillZero();
-                    PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::x,
-                                                                depthwiseLayer->_biases->buffer(),
-                                                                depthwiseLayer->_biases->size() *
-                                                                MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
-
-                    if (depthwiseNode->isBroadcast()) {
-                        float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[0];
-                        for (int i = 1; i < PostOpsIntBlobMemory[blob_idx + 1]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
-                            static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[i] = broadcastValue;
-                        }
-                    }
-
-                    ops.append_depthwise(depthwiseNode->getAlgorithm(),
-                                         (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(),
-                                         (const float *) PostOpsIntBlobMemory[blob_idx + 1]->GetData());
-
-                    blob_idx += 2;
-                }
-            } else {
-                ops.append_depthwise(depthwiseNode->getAlgorithm(),
-                                     nullptr,
-                                     nullptr);
-            }
-
-            continue;
-        }
-
-        auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get());
-        if (activationNode) {
-            ops.append_eltwise(1.0, activationNode->getAlgorithm(), activationNode->getAlpha(), activationNode->getBeta());
-
+        auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
+        if (eltwiseNode) {
+            eltwiseNode->appendPostOps(ops);
             continue;
         }
 
index a740111..fe34f81 100644 (file)
@@ -5,8 +5,6 @@
 #include "mkldnn_scatter_update_node.h"
 #include "desc_iterator.hpp"
 #include "mkldnn_quantize_node.h"
-#include "mkldnn_depthwise_node.h"
-#include "mkldnn_activation_node.h"
 #include <legacy/ie_layers.h>
 #include <mkldnn.hpp>
 #include <string>
index 185f9ba..9b20c89 100644 (file)
@@ -39,9 +39,14 @@ std::vector<CommonTestUtils::OpType> opTypes = {
 };
 
 std::vector<ngraph::helpers::EltwiseTypes> eltwiseOpTypes = {
+        ngraph::helpers::EltwiseTypes::ADD,
         ngraph::helpers::EltwiseTypes::MULTIPLY,
         ngraph::helpers::EltwiseTypes::SUBTRACT,
-        ngraph::helpers::EltwiseTypes::ADD
+        ngraph::helpers::EltwiseTypes::DIVIDE,
+        ngraph::helpers::EltwiseTypes::FLOOR_MOD,
+        ngraph::helpers::EltwiseTypes::SQUARED_DIFF,
+        ngraph::helpers::EltwiseTypes::POWER,
+        ngraph::helpers::EltwiseTypes::MOD
 };
 
 std::map<std::string, std::string> additional_config = {};
index 53d314c..033b20a 100644 (file)
@@ -22,7 +22,6 @@ std::vector<std::string> disabledTestPatterns() {
         R"(.*(QuantGroupConv3D).*)",
         // TODO: Issue 31845
         R"(.*(FakeQuantizeLayerTest).*)",
-        R"(.*(EltwiseLayerTest).*IS=\(.*\..*\..*\..*\..*\).*secondaryInputType=PARAMETER.*opType=SCALAR.*)",
         // TODO: failed to downgrade to opset v0 in interpreter backend
         R"(.*Gather.*axis=-1.*)",
         // TODO: Issue 33151
diff --git a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/eltwise.cpp b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/eltwise.cpp
new file mode 100644 (file)
index 0000000..7d371b4
--- /dev/null
@@ -0,0 +1,327 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <single_layer_tests/eltwise.hpp>
+#include <ngraph_functions/builders.hpp>
+#include "test_utils/cpu_test_utils.hpp"
+
+using namespace InferenceEngine;
+using namespace CPUTestUtils;
+
+namespace CPULayerTestsDefinitions {
+
+// Full parameter set of the CPU eltwise test: the common eltwise layer test parameters combined
+// with the CPU-specific parameters (unpacked in SetUp() as input formats, output formats,
+// priority and selected primitive type).
+typedef std::tuple<
+        LayerTestsDefinitions::EltwiseTestParams,
+        CPUSpecificParams> EltwiseLayerCPUTestParamsSet;
+
+// Parameterized CPU-plugin test of a single Eltwise operation. The ngraph function under test
+// is built in SetUp() from the common eltwise parameters plus the CPU-specific parameters.
+class EltwiseLayerCPUTest : public testing::WithParamInterface<EltwiseLayerCPUTestParamsSet>,
+                                     virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase {
+public:
+    // Returns the test case name: the name produced by the common EltwiseLayerTest for the
+    // basic parameters followed by the name produced by CPUTestsBase for the CPU parameters.
+    static std::string getTestCaseName(testing::TestParamInfo<EltwiseLayerCPUTestParamsSet> obj) {
+        LayerTestsDefinitions::EltwiseTestParams basicParamsSet;
+        CPUSpecificParams cpuParams;
+        std::tie(basicParamsSet, cpuParams) = obj.param;
+
+        std::ostringstream result;
+        result << LayerTestsDefinitions::EltwiseLayerTest::getTestCaseName(testing::TestParamInfo<LayerTestsDefinitions::EltwiseTestParams>(
+                basicParamsSet, 0));
+        result << CPUTestsBase::getTestCaseName(cpuParams);
+
+        return result.str();
+    }
+
+protected:
+    // Unpacks the test parameters and builds `function`: a single eltwise node whose first input
+    // is a Parameter of the first shape and whose second input is created according to the
+    // operation type, the secondary input type and the eltwise type.
+    void SetUp() override {
+        LayerTestsDefinitions::EltwiseTestParams basicParamsSet;
+        CPUSpecificParams cpuParams;
+        std::tie(basicParamsSet, cpuParams) = this->GetParam();
+
+        std::vector<std::vector<size_t>> inputShapes;
+        InferenceEngine::Precision netPrecision;
+        ngraph::helpers::InputLayerType secondaryInputType;
+        CommonTestUtils::OpType opType;
+        ngraph::helpers::EltwiseTypes eltwiseType;
+        std::map<std::string, std::string> additional_config;
+        std::tie(inputShapes, eltwiseType, secondaryInputType, opType, netPrecision, inPrc, outPrc, inLayout, targetDevice, additional_config) = basicParamsSet;
+        std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
+        auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
+
+        // The expected primitive type is derived from the most capable ISA reported by the
+        // with_cpu_x86_* checks; it replaces the selectedType value unpacked from cpuParams above.
+        std::string isaType;
+        if (with_cpu_x86_avx512f()) {
+            isaType = "jit_avx512";
+        } else if (with_cpu_x86_avx2()) {
+            isaType = "jit_avx2";
+        } else if (with_cpu_x86_sse42()) {
+            isaType = "jit_sse42";
+        } else {
+            isaType = "ref";
+        }
+        selectedType = isaType + "_" + "FP32";
+
+        // One shape means both inputs share it; two shapes are used for the first and the
+        // second input respectively. Any other count is rejected.
+        std::vector<size_t> inputShape1, inputShape2;
+        if (inputShapes.size() == 1) {
+            inputShape1 = inputShape2 = inputShapes.front();
+        } else if (inputShapes.size() == 2) {
+            inputShape1 = inputShapes.front();
+            inputShape2 = inputShapes.back();
+        } else {
+            THROW_IE_EXCEPTION << "Incorrect number of input shapes";
+        }
+
+        configuration.insert(additional_config.begin(), additional_config.end());
+        auto input = ngraph::builder::makeParams(ngPrc, {inputShape1});
+
+        // SCALAR uses a single-element secondary input, VECTOR uses the second input shape.
+        std::vector<size_t> shape_input_secondary;
+        switch (opType) {
+            case CommonTestUtils::OpType::SCALAR: {
+                shape_input_secondary = std::vector<size_t>({1});
+                break;
+            }
+            case CommonTestUtils::OpType::VECTOR:
+                shape_input_secondary = inputShape2;
+                break;
+            default:
+                FAIL() << "Unsupported Secondary operation type";
+        }
+
+        std::shared_ptr<ngraph::Node> secondaryInput;
+        if (eltwiseType == ngraph::helpers::EltwiseTypes::DIVIDE ||
+            eltwiseType == ngraph::helpers::EltwiseTypes::FLOOR_MOD ||
+            eltwiseType == ngraph::helpers::EltwiseTypes::MOD) {
+            // For these operations the secondary input is always a constant (secondaryInputType is
+            // not consulted) and zero values are replaced with one, presumably to avoid
+            // division by zero.
+            std::vector<float> data =
+                    NGraphFunctions::Utils::generateVector<ngraph::element::Type_t::f32>(ngraph::shape_size(shape_input_secondary));
+            for (float &i : data) {
+                if (i == 0) {
+                    i = 1;
+                }
+            }
+            secondaryInput = ngraph::builder::makeConstant(ngPrc, shape_input_secondary, data);
+        } else {
+            secondaryInput = ngraph::builder::makeInputLayer(ngPrc, secondaryInputType, shape_input_secondary);
+            // A Parameter secondary input must also be registered as a function input.
+            if (secondaryInputType == ngraph::helpers::InputLayerType::PARAMETER) {
+                input.push_back(std::dynamic_pointer_cast<ngraph::opset3::Parameter>(secondaryInput));
+            }
+        }
+
+        auto eltwise = ngraph::builder::makeEltwise(input[0], secondaryInput, eltwiseType);
+        // Attach the requested formats and priority to the node through its runtime info.
+        eltwise->get_rt_info() = CPUTestsBase::setCPUInfo(inFmts, outFmts, priority);
+        function = std::make_shared<ngraph::Function>(eltwise, input, "Eltwise");
+    }
+};
+
+// Executes the test through Run() (presumably inference plus comparison with reference
+// results, as the test name suggests - confirm against LayerTestsCommon), then calls
+// CheckCPUImpl for the "Eltwise" node with the expected formats and primitive type.
+TEST_P(EltwiseLayerCPUTest, CompareWithRefs) {
+    SKIP_IF_CURRENT_TEST_IS_DISABLED()
+
+    Run();
+    CheckCPUImpl(executableNetwork, "Eltwise", inFmts, outFmts, selectedType);
+}
+
+namespace {
+
+// Kinds of secondary input to test: a constant or a second network parameter.
+std::vector<ngraph::helpers::InputLayerType> secondaryInputTypes = {
+        ngraph::helpers::InputLayerType::CONSTANT,
+        ngraph::helpers::InputLayerType::PARAMETER,
+};
+
+// Only the VECTOR operation type is exercised by the CPU-specific instantiations.
+std::vector<CommonTestUtils::OpType> opTypes = {
+        CommonTestUtils::OpType::VECTOR,
+};
+
+// Eltwise operations covered by the CPU-specific instantiations.
+std::vector<ngraph::helpers::EltwiseTypes> eltwiseOpTypes = {
+        ngraph::helpers::EltwiseTypes::ADD,
+        ngraph::helpers::EltwiseTypes::MULTIPLY,
+        // TODO: Disabled because memory formats filter is not propagated through ngraph transformations
+//        ngraph::helpers::EltwiseTypes::SUBTRACT,
+//        ngraph::helpers::EltwiseTypes::DIVIDE,
+        ngraph::helpers::EltwiseTypes::FLOOR_MOD,
+        ngraph::helpers::EltwiseTypes::SQUARED_DIFF,
+};
+
+// No additional plugin configuration is passed to the tests.
+std::map<std::string, std::string> additional_config = {};
+
+// Adapts the CPU-specific parameters to the current machine: when AVX-512F is not available,
+// every 16-channel blocked format (nChw16c / nCdhw16c) in the input and output format lists is
+// replaced with its 8-channel counterpart (nChw8c / nCdhw8c).
+// NOTE: paramsVector is modified in place; the returned vector is a copy of the adjusted argument.
+std::vector<CPUSpecificParams> filterCPUSpecificParams(std::vector<CPUSpecificParams>& paramsVector) {
+    auto adjustBlockedFormatByIsa = [](std::vector<cpu_memory_format_t>& formats) {
+        // Range-based iteration avoids comparing a signed index with the unsigned size().
+        for (auto& format : formats) {
+            if (format == nChw16c)
+                format = nChw8c;
+            if (format == nCdhw16c)
+                format = nCdhw8c;
+        }
+    };
+
+    if (!with_cpu_x86_avx512f()) {
+        for (auto& param : paramsVector) {
+            // Tuple elements 0 and 1 are the input and output format lists respectively
+            // (same order as the std::tie unpacking in SetUp()).
+            adjustBlockedFormatByIsa(std::get<0>(param));
+            adjustBlockedFormatByIsa(std::get<1>(param));
+        }
+    }
+
+    return paramsVector;
+}
+
+// 4D input shape sets. An entry with one shape is used for both inputs; an entry with two
+// shapes gives the first and the second input shape (see SetUp()).
+std::vector<std::vector<std::vector<size_t>>> inShapes_4D = {
+        {{2, 4, 4, 1}},
+        {{2, 17, 5, 4}},
+        {{2, 17, 5, 4}, {1, 17, 1, 1}},
+        {{2, 17, 5, 1}, {1, 17, 1, 4}},
+};
+
+// CPU-specific parameters for 4D tensors: {input formats}, {output formats}, priority, selected type.
+// Blocked, channels-last and planar layouts are covered; priority and selected type are left empty.
+std::vector<CPUSpecificParams> cpuParams_4D = {
+        CPUSpecificParams({nChw16c, nChw16c}, {nChw16c}, {}, {}),
+        CPUSpecificParams({nhwc, nhwc}, {nhwc}, {}, {}),
+        CPUSpecificParams({nchw, nchw}, {nchw}, {}, {})
+};
+
+// Cartesian product of the common eltwise parameters (FP32 network precision, CPU device) with
+// the CPU-specific parameters adjusted for the current ISA by filterCPUSpecificParams.
+const auto params_4D_FP32 = ::testing::Combine(
+        ::testing::Combine(
+            ::testing::ValuesIn(inShapes_4D),
+            ::testing::ValuesIn(eltwiseOpTypes),
+            ::testing::ValuesIn(secondaryInputTypes),
+            ::testing::ValuesIn(opTypes),
+            ::testing::Values(InferenceEngine::Precision::FP32),
+            ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
+            ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
+            ::testing::Values(InferenceEngine::Layout::ANY),
+            ::testing::Values(CommonTestUtils::DEVICE_CPU),
+            ::testing::Values(additional_config)),
+        ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D)));
+
+INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_4D_FP32, EltwiseLayerCPUTest, params_4D_FP32, EltwiseLayerCPUTest::getTestCaseName);
+
+
+// 5D input shape sets. An entry with one shape is used for both inputs; an entry with two
+// shapes gives the first and the second input shape (see SetUp()).
+std::vector<std::vector<std::vector<size_t>>> inShapes_5D = {
+        {{2, 4, 3, 4, 1}},
+        {{2, 17, 7, 5, 4}},
+        {{2, 17, 6, 5, 4}, {1, 17, 6, 1, 1}},
+        {{2, 17, 6, 5, 1}, {1, 17, 1, 1, 4}},
+};
+
+// CPU-specific parameters for 5D tensors: {input formats}, {output formats}, priority, selected type.
+// Blocked, channels-last and planar layouts are covered; priority and selected type are left empty.
+std::vector<CPUSpecificParams> cpuParams_5D = {
+        CPUSpecificParams({nCdhw16c, nCdhw16c}, {nCdhw16c}, {}, {}),
+        CPUSpecificParams({ndhwc, ndhwc}, {ndhwc}, {}, {}),
+        CPUSpecificParams({ncdhw, ncdhw}, {ncdhw}, {}, {})
+};
+
+// Cartesian product of the common eltwise parameters (FP32 network precision, CPU device) with
+// the CPU-specific parameters adjusted for the current ISA by filterCPUSpecificParams.
+const auto params_5D_FP32 = ::testing::Combine(
+        ::testing::Combine(
+            ::testing::ValuesIn(inShapes_5D),
+            ::testing::ValuesIn(eltwiseOpTypes),
+            ::testing::ValuesIn(secondaryInputTypes),
+            ::testing::ValuesIn(opTypes),
+            ::testing::Values(InferenceEngine::Precision::FP32),
+            ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
+            ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
+            ::testing::Values(InferenceEngine::Layout::ANY),
+            ::testing::Values(CommonTestUtils::DEVICE_CPU),
+            ::testing::Values(additional_config)),
+        ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D)));
+
+INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_5D_FP32, EltwiseLayerCPUTest, params_5D_FP32, EltwiseLayerCPUTest::getTestCaseName);
+
+
+// Mixed-format 4D cases: the first format list pairs nChw16c with nchw, the second list is nChw16c
+// (presumably input and output formats respectively - confirm against CPUSpecificParams).
+// In both shape pairs the second shape has 1 in its second dimension.
+std::vector<std::vector<std::vector<size_t>>> inShapes_4D_Blocked_Planar = {
+        {{2, 17, 31, 3}, {2, 1, 31, 3}},
+        {{2, 17, 5, 1}, {2, 1, 1, 4}},
+};
+
+std::vector<CPUSpecificParams> cpuParams_4D_Blocked_Planar = {
+        CPUSpecificParams({nChw16c, nchw}, {nChw16c}, {}, {}),
+};
+
+// Unlike the same-format groups, the secondary input type is fixed to CONSTANT here.
+const auto params_4D_FP32_Blocked_Planar = ::testing::Combine(
+        ::testing::Combine(
+            ::testing::ValuesIn(inShapes_4D_Blocked_Planar),
+            ::testing::ValuesIn(eltwiseOpTypes),
+            ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT),
+            ::testing::ValuesIn(opTypes),
+            ::testing::Values(InferenceEngine::Precision::FP32),
+            ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
+            ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
+            ::testing::Values(InferenceEngine::Layout::ANY),
+            ::testing::Values(CommonTestUtils::DEVICE_CPU),
+            ::testing::Values(additional_config)),
+        ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D_Blocked_Planar)));
+
+INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_4D_FP32_Blocked_Planar, EltwiseLayerCPUTest, params_4D_FP32_Blocked_Planar, EltwiseLayerCPUTest::getTestCaseName);
+
+
+// Mixed-format 4D cases: the first format list pairs nchw with nChw16c, the second list is nChw16c
+// (presumably input and output formats respectively - confirm against CPUSpecificParams).
+// In both shape pairs the first shape has 1 in its second dimension.
+std::vector<std::vector<std::vector<size_t>>> inShapes_4D_Planar_Blocked = {
+        {{2, 1, 31, 3}, {2, 17, 31, 3}},
+        {{2, 1, 1, 4}, {2, 17, 5, 1}},
+};
+
+std::vector<CPUSpecificParams> cpuParams_4D_Planar_Blocked = {
+        CPUSpecificParams({nchw, nChw16c}, {nChw16c}, {}, {}),
+};
+
+// Unlike the same-format groups, the secondary input type is fixed to CONSTANT here.
+const auto params_4D_FP32_Planar_Blocked = ::testing::Combine(
+        ::testing::Combine(
+            ::testing::ValuesIn(inShapes_4D_Planar_Blocked),
+            ::testing::ValuesIn(eltwiseOpTypes),
+            ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT),
+            ::testing::ValuesIn(opTypes),
+            ::testing::Values(InferenceEngine::Precision::FP32),
+            ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
+            ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
+            ::testing::Values(InferenceEngine::Layout::ANY),
+            ::testing::Values(CommonTestUtils::DEVICE_CPU),
+            ::testing::Values(additional_config)),
+        ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D_Planar_Blocked)));
+
+INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_4D_FP32_Planar_Blocked, EltwiseLayerCPUTest, params_4D_FP32_Planar_Blocked, EltwiseLayerCPUTest::getTestCaseName);
+
+
+// Mixed-format 5D cases: the first format list pairs nCdhw16c with ncdhw, the second list is nCdhw16c
+// (presumably input and output formats respectively - confirm against CPUSpecificParams).
+// In both shape pairs the second shape has 1 in its second dimension.
+std::vector<std::vector<std::vector<size_t>>> inShapes_5D_Blocked_Planar = {
+        {{2, 17, 31, 4, 3}, {2, 1, 31, 1, 3}},
+        {{2, 17, 5, 3, 1}, {2, 1, 1, 3, 4}},
+};
+
+std::vector<CPUSpecificParams> cpuParams_5D_Blocked_Planar = {
+        CPUSpecificParams({nCdhw16c, ncdhw}, {nCdhw16c}, {}, {}),
+};
+
+// Unlike the same-format groups, the secondary input type is fixed to CONSTANT here.
+const auto params_5D_FP32_Blocked_Planar = ::testing::Combine(
+        ::testing::Combine(
+            ::testing::ValuesIn(inShapes_5D_Blocked_Planar),
+            ::testing::ValuesIn(eltwiseOpTypes),
+            ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT),
+            ::testing::ValuesIn(opTypes),
+            ::testing::Values(InferenceEngine::Precision::FP32),
+            ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
+            ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
+            ::testing::Values(InferenceEngine::Layout::ANY),
+            ::testing::Values(CommonTestUtils::DEVICE_CPU),
+            ::testing::Values(additional_config)),
+        ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D_Blocked_Planar)));
+
+INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_5D_FP32_Blocked_Planar, EltwiseLayerCPUTest, params_5D_FP32_Blocked_Planar, EltwiseLayerCPUTest::getTestCaseName);
+
+
+// Mixed-format 5D cases: the first format list pairs ncdhw with nCdhw16c, the second list is nCdhw16c
+// (presumably input and output formats respectively - confirm against CPUSpecificParams).
+// In both shape pairs the first shape has 1 in its second dimension.
+std::vector<std::vector<std::vector<size_t>>> inShapes_5D_Planar_Blocked = {
+        {{2, 1, 31, 1, 3}, {2, 17, 31, 4, 3}},
+        {{2, 1, 1, 3, 4}, {2, 17, 5, 3, 1}},
+};
+
+std::vector<CPUSpecificParams> cpuParams_5D_Planar_Blocked = {
+        CPUSpecificParams({ncdhw, nCdhw16c}, {nCdhw16c}, {}, {}),
+};
+
+// Unlike the same-format groups, the secondary input type is fixed to CONSTANT here.
+const auto params_5D_FP32_Planar_Blocked = ::testing::Combine(
+        ::testing::Combine(
+                ::testing::ValuesIn(inShapes_5D_Planar_Blocked),
+                ::testing::ValuesIn(eltwiseOpTypes),
+                ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT),
+                ::testing::ValuesIn(opTypes),
+                ::testing::Values(InferenceEngine::Precision::FP32),
+                ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
+                ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
+                ::testing::Values(InferenceEngine::Layout::ANY),
+                ::testing::Values(CommonTestUtils::DEVICE_CPU),
+                ::testing::Values(additional_config)),
+        ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D_Planar_Blocked)));
+
+INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_5D_FP32_Planar_Blocked, EltwiseLayerCPUTest, params_5D_FP32_Planar_Blocked, EltwiseLayerCPUTest::getTestCaseName);
+
+} // namespace
+} // namespace CPULayerTestsDefinitions
diff --git a/inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/eltwise_chain.cpp b/inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/eltwise_chain.cpp
new file mode 100644 (file)
index 0000000..fad9068
--- /dev/null
@@ -0,0 +1,184 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+#include <tuple>
+#include <string>
+#include <vector>
+#include <memory>
+#include <debug.h>
+#include <functional_test_utils/layer_test_utils.hpp>
+#include <ngraph_functions/builders.hpp>
+#include <ie_precision.hpp>
+#include "common_test_utils/common_utils.hpp"
+#include "functional_test_utils/precision_utils.hpp"
+#include "functional_test_utils/skip_tests_config.hpp"
+#include "test_utils/cpu_test_utils.hpp"
+#include "ie_system_conf.h"
+
+using namespace CPUTestUtils;
+using InferenceEngine::Precision;
+using ngraph::helpers::EltwiseTypes;
+using FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc;
+
+namespace CPULayerTestsDefinitions {
+
+// Parameter tuple of the eltwise chain test; element order matters because it is unpacked with std::tie.
+using EltwiseChainTuple = std::tuple<
+        std::vector<std::vector<size_t>>,        // Input shapes
+        std::vector<InferenceEngine::Precision>, // Input precisions
+        std::vector<EltwiseTypes>,               // Eltwise operations
+        bool,                                    // With quantization
+        std::string>;                            // Device name
+
+// Functional test that chains several eltwise operations on top of each other.
+// The first shape/precision pair becomes the ngraph Parameter; every following pair becomes a
+// Constant that feeds the second input of one eltwise op. With quantization enabled a
+// FakeQuantize (256 levels) is placed in front of the last eltwise op of the chain.
+class EltwiseChainTest : public testing::WithParamInterface<EltwiseChainTuple>,
+                         virtual public LayerTestsUtils::LayerTestsCommon {
+public:
+    // Builds the test-case name from every element of the parameter tuple.
+    static std::string getTestCaseName(const testing::TestParamInfo<EltwiseChainTuple> &obj) {
+        std::vector<std::vector<size_t>> inputShapes;
+        std::vector<InferenceEngine::Precision> inputPrecisions;
+        std::vector<EltwiseTypes> eltwiseOpTypes;
+        bool withQuantization;
+        std::string targetName;
+        std::tie(inputShapes, inputPrecisions, eltwiseOpTypes, withQuantization, targetName) = obj.param;
+        std::ostringstream results;
+
+        // size_t counters: size() is unsigned, an int counter would be a signed/unsigned comparison.
+        for (size_t i = 0; i < inputShapes.size(); i++) {
+            results << "IS" << i << "=" << CommonTestUtils::vec2str(inputShapes[i]) << "_";
+        }
+        for (size_t i = 0; i < inputPrecisions.size(); i++) {
+            results << "InPRC" << i << "=" << inputPrecisions[i].name() << "_";
+        }
+        for (size_t i = 0; i < eltwiseOpTypes.size(); i++) {
+            results << "Op" << i << "=" << eltwiseOpTypes[i] << "_";
+        }
+
+        results << "WithQuant=" << withQuantization << "_";
+        results << "targetDevice=" << targetName;
+
+        return results.str();
+    }
+
+protected:
+    // Builds the ngraph function under test from the current parameter tuple.
+    void SetUp() override {
+        threshold = 0.1f;
+
+        std::vector<std::vector<size_t>> inputShapes;
+        std::vector<InferenceEngine::Precision> inputPrecisions;
+        std::vector<EltwiseTypes> eltwiseOpTypes;
+        bool withQuantization;
+        std::tie(inputShapes, inputPrecisions, eltwiseOpTypes, withQuantization, targetDevice) = this->GetParam();
+
+        // Fail the test instead of indexing out of bounds when a parameter set is malformed:
+        // op i consumes constant i, and constant i is built from shape/precision i + 1.
+        ASSERT_FALSE(eltwiseOpTypes.empty());
+        ASSERT_GE(inputShapes.size(), inputPrecisions.size());
+        ASSERT_GT(inputPrecisions.size(), eltwiseOpTypes.size());
+
+        auto ngraphParam = ngraph::builder::makeParams(convertIE2nGraphPrc(inputPrecisions[0]), {inputShapes[0]});
+
+        // NOTE(review): the trailing `true` presumably asks makeConstant for random data - confirm against the builder.
+        std::vector<std::shared_ptr<ngraph::Node>> ngraphInputs;
+        for (size_t i = 1; i < inputPrecisions.size(); i++) {
+            std::vector<float> ngraphInput1Data(ngraph::shape_size(ngraph::Shape{inputShapes[i]}));
+            ngraphInputs.push_back(ngraph::builder::makeConstant(convertIE2nGraphPrc(inputPrecisions[i]), ngraph::Shape{inputShapes[i]},
+                                                                 ngraphInput1Data, true));
+        }
+
+        // With quantization the last op is appended separately, after the FakeQuantize.
+        const size_t lastOp = eltwiseOpTypes.size() - 1;
+        const size_t plainOpsCount = withQuantization ? lastOp : eltwiseOpTypes.size();
+
+        // `chain` always points at the most recently created node.
+        std::shared_ptr<ngraph::Node> chain = ngraph::builder::makeEltwise(ngraphParam[0], ngraphInputs[0], eltwiseOpTypes[0]);
+        for (size_t i = 1; i < plainOpsCount; i++) {
+            chain = ngraph::builder::makeEltwise(chain, ngraphInputs[i], eltwiseOpTypes[i]);
+        }
+
+        if (withQuantization) {
+            // FakeQuantize constants take the extent of dimension 1 of the first input and 1 elsewhere.
+            ASSERT_GE(inputShapes[0].size(), 2u);
+            std::vector<size_t> constShape(inputShapes[0].size(), 1);
+            constShape[1] = inputShapes[0][1];
+            auto fq = ngraph::builder::makeFakeQuantize(chain,
+                                                        ::ngraph::element::Type(::ngraph::element::Type_t::f32),
+                                                        256, constShape);
+
+            chain = ngraph::builder::makeEltwise(fq, ngraphInputs[lastOp], eltwiseOpTypes[lastOp]);
+        }
+
+        ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(chain)};
+        function = std::make_shared<ngraph::Function>(results, ngraphParam, withQuantization ? "eltwise_chain_fq" : "eltwise_chain");
+    }
+};
+
+// Executes Run() for every instantiated parameter set. Run() is not defined in this file
+// (presumably inherited from LayerTestsUtils::LayerTestsCommon - confirm).
+TEST_P(EltwiseChainTest, CompareWithRefs) {
+    SKIP_IF_CURRENT_TEST_IS_DISABLED()
+
+    Run();
+}
+
+namespace {
+
+// Shape sets for the chain without FakeQuantize. Every entry lists four shapes: EltwiseChainTest::SetUp
+// uses the first one for the Parameter and the remaining three for the Constant inputs.
+std::vector<std::vector<std::vector<size_t>>> inputShapes {
+        {
+            {{1, 1, 2, 3}, {1, 1, 2, 3}, {1, 1, 2, 3}, {1, 1, 2, 3}},
+            {{1, 48, 5, 6}, {1, 48, 1, 1}, {1, 48, 5, 6}, {1, 1, 5, 6}},
+            {{1, 72, 28, 28}, {1, 72, 1, 1}, {1, 72, 1, 1}, {1, 72, 1, 1}},
+            {{2, 33, 5, 5}, {2, 33, 5, 5}, {2, 33, 1, 5}, {2, 33, 5, 5}},
+            {{1, 2, 3}, {3}, {3}, {3}},
+            {{1, 12, 5, 5}, {5, 5}, {12, 5, 5}, {1}},
+            {{3, 12, 5, 5}, {1, 12, 5, 1}, {3, 1, 1, 1}, {3, 12, 5, 5}},
+            {{1, 1, 1, 1}, {1, 12, 5, 1}, {3, 12, 1, 5}, {3, 12, 5, 1}},
+            {{1, 1, 1, 1, 6}, {1, 12, 5, 1, 6}, {3, 12, 1, 5, 1}, {3, 12, 5, 1, 1}}
+        }
+};
+
+// One precision per input shape; the chain is exercised with all-FP32 and all-I32 inputs.
+std::vector<std::vector<InferenceEngine::Precision>> inputPrecisions = {
+        { Precision::FP32, Precision::FP32, Precision::FP32, Precision::FP32 },
+        { Precision::I32, Precision::I32, Precision::I32, Precision::I32 }
+};
+
+// Operation sequences; op i is applied to the previous result and Constant input i.
+std::vector<std::vector<EltwiseTypes>> eltwiseOps = {
+        { EltwiseTypes::ADD, EltwiseTypes::MULTIPLY, EltwiseTypes::SUBTRACT },
+        { EltwiseTypes::DIVIDE, EltwiseTypes::POWER, EltwiseTypes::ADD },
+};
+
+// Plain chain: quantization flag fixed to false.
+INSTANTIATE_TEST_CASE_P(smoke_EltwiseChain, EltwiseChainTest,
+                        ::testing::Combine(
+                                ::testing::ValuesIn(inputShapes),
+                                ::testing::ValuesIn(inputPrecisions),
+                                ::testing::ValuesIn(eltwiseOps),
+                                ::testing::Values(false),
+                                ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        EltwiseChainTest::getTestCaseName);
+
+// Shape sets for the chain with a FakeQuantize in front of the last op. Every entry lists four
+// shapes (Parameter first, then three Constant inputs); ranks 4, 5 and 7 are covered.
+std::vector<std::vector<std::vector<size_t>>> inputShapesFQ {
+        {
+            {{1, 2, 2, 3}, {1, 2, 2, 3}, {1, 2, 2, 3}, {1, 2, 2, 3}},
+            {{2, 33, 5, 5}, {2, 33, 5, 5}, {2, 33, 1, 5}, {2, 33, 5, 5}},
+            {{2, 33, 5, 17}, {2, 33, 5, 17}, {2, 33, 5, 17}, {2, 33, 5, 17}},
+            {{2, 33, 5, 256}, {2, 33, 5, 256}, {2, 33, 5, 256}, {2, 33, 5, 256}},
+            {{2, 5, 7, 5}, {2, 5, 1, 5}, {2, 5, 7, 5}, {2, 5, 7, 5}},
+            {{2, 17, 7, 5}, {2, 17, 7, 5}, {2, 17, 7, 5}, {2, 17, 7, 5}},
+            {{2, 256, 7, 5}, {2, 256, 7, 5}, {2, 256, 1, 5}, {2, 256, 7, 5}},
+            {{1, 36, 34, 34}, {1, 36, 34, 34}, {1, 36, 34, 34}, {1, 36, 34, 34}},
+            {{1, 12, 1, 1, 6}, {1, 12, 5, 1, 6}, {3, 12, 1, 5, 1}, {3, 12, 5, 1, 1}},
+            {{1, 12, 1, 1, 6}, {1, 12, 5, 5, 6}, {3, 12, 1, 5, 1}, {3, 12, 5, 5, 1}},
+            {{1, 12, 1, 1, 1}, {1, 12, 5, 1, 7}, {3, 12, 1, 5, 7}, {3, 12, 5, 1, 7}},
+            {{1, 7, 1, 1, 12}, {1, 7, 5, 1, 12}, {3, 7, 1, 5, 12}, {3, 7, 5, 1, 12}},
+            {{1, 7, 1, 1, 12, 3, 7}, {1, 7, 5, 1, 12, 3, 7}, {3, 7, 1, 5, 12, 3, 7}, {3, 7, 5, 1, 12, 3, 7}},
+            {{1, 7, 1, 1, 12, 3, 1}, {1, 7, 5, 1, 12, 3, 7}, {3, 7, 1, 5, 12, 1, 7}, {3, 7, 5, 1, 12, 3, 1}}
+        }
+};
+
+// The FakeQuantize variant is only instantiated with FP32 inputs.
+std::vector<std::vector<InferenceEngine::Precision>> inputPrecisionsFQ {
+        { Precision::FP32, Precision::FP32, Precision::FP32, Precision::FP32 }
+};
+
+// Quantized chain: reuses eltwiseOps and fixes the quantization flag to true.
+INSTANTIATE_TEST_CASE_P(smoke_EltwiseChainWithFQ, EltwiseChainTest,
+                    ::testing::Combine(
+                            ::testing::ValuesIn(inputShapesFQ),
+                            ::testing::ValuesIn(inputPrecisionsFQ),
+                            ::testing::ValuesIn(eltwiseOps),
+                            ::testing::Values(true),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        EltwiseChainTest::getTestCaseName);
+
+} // namespace
+} // namespace CPULayerTestsDefinitions
index dec5e1e..22bf1a5 100644 (file)
@@ -93,10 +93,25 @@ void EltwiseLayerTest::SetUp() {
             FAIL() << "Unsupported Secondary operation type";
     }
 
-    auto secondaryInput = ngraph::builder::makeInputLayer(ngPrc, secondaryInputType, shape_input_secondary);
-    if (secondaryInputType == ngraph::helpers::InputLayerType::PARAMETER) {
-        input.push_back(std::dynamic_pointer_cast<ngraph::opset3::Parameter>(secondaryInput));
+    std::shared_ptr<ngraph::Node> secondaryInput;
+    if (eltwiseType == ngraph::helpers::EltwiseTypes::DIVIDE ||
+        eltwiseType == ngraph::helpers::EltwiseTypes::FLOOR_MOD ||
+        eltwiseType == ngraph::helpers::EltwiseTypes::MOD) {
+        std::vector<float> data(ngraph::shape_size(shape_input_secondary));
+        data = NGraphFunctions::Utils::generateVector<ngraph::element::Type_t::f32>(ngraph::shape_size(shape_input_secondary));
+        for (float &i : data) {
+            if (i == 0) {
+                i = 1;
+            }
+        }
+        secondaryInput = ngraph::builder::makeConstant(ngPrc, shape_input_secondary, data);
+    } else {
+        secondaryInput = ngraph::builder::makeInputLayer(ngPrc, secondaryInputType, shape_input_secondary);
+        if (secondaryInputType == ngraph::helpers::InputLayerType::PARAMETER) {
+            input.push_back(std::dynamic_pointer_cast<ngraph::opset3::Parameter>(secondaryInput));
+        }
     }
+
     auto eltwise = ngraph::builder::makeEltwise(input[0], secondaryInput, eltwiseType);
     function = std::make_shared<ngraph::Function>(eltwise, input, "Eltwise");
 }
index deddabb..4d05cef 100644 (file)
@@ -564,6 +564,9 @@ std::ostream& operator<<(std::ostream & os, ngraph::helpers::EltwiseTypes type)
         case ngraph::helpers::EltwiseTypes::FLOOR_MOD:
             os << "FloorMod";
             break;
+        case ngraph::helpers::EltwiseTypes::MOD:
+            os << "Mod";
+            break;
         default:
             throw std::runtime_error("NOT_SUPPORTED_OP_TYPE");
     }
index e17f2ab..d96d449 100644 (file)
@@ -261,655 +261,6 @@ std::string select_op(eltwise_test_params::opType op) {
     return str_op;
 }
 
-class MKLDNNGraphEltwise3InputsTests: public TestsCommon,
-                                     public WithParamInterface<eltwise_test_params> {
-    std::string model_t = R"V0G0N(
-<net name="EltwiseOnly" version="3" precision="FP32" batch="1">
-    <layers>
-        <layer name="in1" type="Input" precision="FP32" id="1">
-            <output>
-                <port id="1">__SRC_DIMS_1__
-                </port>
-            </output>
-        </layer>
-        <layer name="in2" type="Input" precision="FP32" id="2">
-            <output>
-                <port id="2">__SRC_DIMS_2__
-                </port>
-            </output>
-        </layer>
-        <layer name="in3" type="Input" precision="FP32" id="3">
-            <output>
-                <port id="3">__SRC_DIMS_3__
-                </port>
-            </output>
-        </layer>
-        <layer name="con" id="4" type="Eltwise" precision="FP32">
-            <data operation="_OP_" _COEFF_/>
-            <input>
-                <port id="1">__SRC_DIMS_1__
-                </port>
-                <port id="2">__SRC_DIMS_2__
-                </port>
-                <port id="3">__SRC_DIMS_3__
-                </port>
-            </input>
-            <output>
-                <port id="4">__SRC_DIMS__
-                </port>
-            </output>
-        </layer>
-    </layers>
-    <edges>
-        <edge from-layer="1" from-port="1" to-layer="4" to-port="1"/>
-        <edge from-layer="2" from-port="2" to-layer="4" to-port="2"/>
-        <edge from-layer="3" from-port="3" to-layer="4" to-port="3"/>
-    </edges>
-</net>
-)V0G0N";
-
-protected:
-    std::string getModel(eltwise_test_params p) {
-        std::string model = model_t;
-        std::string op = select_op(p.op);
-
-        std::string src_dims1;
-        for (auto &dim : p.dims1) {
-            src_dims1 += "\n                    <dim>";
-            src_dims1 += std::to_string(dim) + "</dim>";
-        }
-        REPLACE_WITH_STR(model, "__SRC_DIMS_1__", src_dims1);
-
-        std::string src_dims2;
-        for (auto &dim : p.dims2) {
-            src_dims2 += "\n                    <dim>";
-            src_dims2 += std::to_string(dim) + "</dim>";
-        }
-        REPLACE_WITH_STR(model, "__SRC_DIMS_2__", src_dims2);
-
-        std::string src_dims3;
-        for (auto &dim : p.dims3) {
-            src_dims3 += "\n                    <dim>";
-            src_dims3 += std::to_string(dim) + "</dim>";
-        }
-        REPLACE_WITH_STR(model, "__SRC_DIMS_3__", src_dims3);
-
-        std::string src_dims;
-        std::vector<size_t> dims = p.dims1;
-        for (int i = 0; i < dims.size(); i++) {
-            dims[i] = std::max(p.dims1[i], p.dims2[i]);
-            dims[i] = std::max(dims[i], p.dims3[i]);
-        }
-        for (auto &dim : dims) {
-            src_dims += "\n                    <dim>";
-            src_dims += std::to_string(dim) + "</dim>";
-        }
-        REPLACE_WITH_STR(model, "__SRC_DIMS__", src_dims);
-
-        std::string scale;
-        if (!p.scales.empty()) {
-            scale = std::string("coeff=\"") + to_string_c_locale(p.scales) + std::string("\"");
-        }
-        REPLACE_WITH_STR(model, "_OP_", op);
-        REPLACE_WITH_STR(model, "_COEFF_", scale);
-
-        return model;
-    }
-
-    virtual void TearDown() {
-    }
-
-    virtual void SetUp() {
-        try {
-            TestsCommon::SetUp();
-            eltwise_test_params p = ::testing::WithParamInterface<eltwise_test_params>::GetParam();
-            std::string model = getModel(p);
-
-            InferenceEngine::Core core;
-            InferenceEngine::CNNNetwork network;
-            ASSERT_NO_THROW(network = core.ReadNetwork(model, InferenceEngine::Blob::CPtr()));
-
-            MKLDNNGraphTestClass graph;
-            graph.CreateGraph(network);
-
-            auto& nodes = graph.getNodes();
-            for (int i = 0; i < nodes.size(); i++) {
-                if (nodes[i]->getType() == MKLDNNPlugin::Eltwise) {
-                    ASSERT_EQ(p.num_prim_desc, nodes[i]->getSupportedPrimitiveDescriptors().size());
-                    for (size_t j = 0; j < p.num_prim_desc && j < p.comp.size(); j++) {
-                        p.comp.at(j)(nodes[i]->getSupportedPrimitiveDescriptors().at(j));
-                    }
-                    ASSERT_NE(nullptr, nodes[i]->getSelectedPrimitiveDescriptor());
-                    ASSERT_EQ(p.selectedType, nodes[i]->getSelectedPrimitiveDescriptor()->getImplementationType());
-                }
-            }
-            InferenceEngine::SizeVector dims_src1 = p.dims1;
-            InferenceEngine::Layout layout1 = InferenceEngine::ANY;
-            switch (p.dims1.size()) {
-                case 4:
-                    layout1 = InferenceEngine::NCHW;
-                    break;
-                case 5:
-                    layout1 = InferenceEngine::NCDHW;
-                    break;
-            }
-            InferenceEngine::SizeVector dims_src2 = p.dims2;
-            InferenceEngine::Layout layout2 = InferenceEngine::ANY;
-            switch (p.dims2.size()) {
-                case 4:
-                    layout2 = InferenceEngine::NCHW;
-                    break;
-                case 5:
-                    layout2 = InferenceEngine::NCDHW;
-                    break;
-            }
-            InferenceEngine::SizeVector dims_src3 = p.dims3;
-            InferenceEngine::Layout layout3 = InferenceEngine::ANY;
-            switch (p.dims3.size()) {
-                case 4:
-                    layout3 = InferenceEngine::NCHW;
-                    break;
-                case 5:
-                    layout3 = InferenceEngine::NCDHW;
-                    break;
-            }
-
-            InferenceEngine::Blob::Ptr src1 = InferenceEngine::make_shared_blob<float>({InferenceEngine::Precision::FP32, dims_src1, layout1});
-            src1->allocate();
-
-            InferenceEngine::TBlob<float>* srcPtr1 = dynamic_cast<InferenceEngine::TBlob<float>*>(src1.get());
-
-            if (srcPtr1 == nullptr)
-                FAIL() << "Cannot cast blob to TBlob<float>.";
-            CommonTestUtils::fill_data_sine(src1->buffer(), src1->size(), 0.1, 0.9, 1);
-            InferenceEngine::Blob::Ptr src2 = InferenceEngine::make_shared_blob<float>({InferenceEngine::Precision::FP32, dims_src2, layout2});
-            src2->allocate();
-
-            InferenceEngine::TBlob<float>* srcPtr2 = dynamic_cast<InferenceEngine::TBlob<float>*>(src2.get());
-
-            if (srcPtr2 == nullptr)
-                FAIL() << "Cannot cast blob to TBlob<float>.";
-            CommonTestUtils::fill_data_sine(src2->buffer(), src2->size(), 0.1, 0.9, 2);
-            InferenceEngine::Blob::Ptr src3 = InferenceEngine::make_shared_blob<float>({InferenceEngine::Precision::FP32, dims_src3, layout3});
-            src3->allocate();
-
-            InferenceEngine::TBlob<float>* srcPtr3 = dynamic_cast<InferenceEngine::TBlob<float>*>(src3.get());
-
-            if (srcPtr3 == nullptr)
-                FAIL() << "Cannot cast blob to TBlob<float>.";
-            CommonTestUtils::fill_data_sine(src3->buffer(), src3->size(), 0.1, 0.9, 3);
-            InferenceEngine::BlobMap srcs;
-            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in1", src1));
-            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in2", src2));
-            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in3", src3));
-
-            InferenceEngine::OutputsDataMap out;
-            out = network.getOutputsInfo();
-            InferenceEngine::BlobMap outputBlobs;
-
-            std::pair<std::string, InferenceEngine::DataPtr> item = *out.begin();
-
-            InferenceEngine::TBlob<float>::Ptr output;
-            output = InferenceEngine::make_shared_blob<float>(item.second->getTensorDesc());
-            output->allocate();
-            outputBlobs[item.first] = output;
-
-            graph.Infer(srcs, outputBlobs);
-
-            InferenceEngine::TBlob<float> dst_ref(item.second->getTensorDesc());
-            dst_ref.allocate();
-
-            std::vector<InferenceEngine::TBlob<float>> src_vec = {*srcPtr1, *srcPtr2, *srcPtr3};
-
-            ref_eltwise(src_vec, dst_ref, p);
-
-            compare(*output, dst_ref, 0.0005f);
-        } catch (const InferenceEngine::details::InferenceEngineException &e) {
-            FAIL() << e.what();
-        }
-    }
-};
-
-TEST_P(MKLDNNGraphEltwise3InputsTests, TestsEltwise) {}
-
-
-INSTANTIATE_TEST_CASE_P(
-        TestsEltwise, MKLDNNGraphEltwise3InputsTests,
-        ::testing::Values(
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref, {
-                        [](MKLDNNPlugin::PrimitiveDescInfo impl) {
-                            ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType());
-                            ASSERT_EQ(3, impl.getConfig().inConfs.size());
-                            ASSERT_EQ(1, impl.getConfig().outConfs.size());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(1).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(2).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
-                        }
-                } },
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sum, "1.0,1.0,1.0", 3, MKLDNNPlugin::impl_desc_type::ref, {
-                        [](MKLDNNPlugin::PrimitiveDescInfo impl) {
-                            ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType());
-                            ASSERT_EQ(3, impl.getConfig().inConfs.size());
-                            ASSERT_EQ(1, impl.getConfig().outConfs.size());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(1).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(2).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
-                        }
-                } },
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sum, "1.5,0.5,-2.0", 3, MKLDNNPlugin::impl_desc_type::ref, {
-                        [](MKLDNNPlugin::PrimitiveDescInfo impl) {
-                            ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType());
-                            ASSERT_EQ(3, impl.getConfig().inConfs.size());
-                            ASSERT_EQ(1, impl.getConfig().outConfs.size());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(1).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(2).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
-                        }
-                } },
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Prod, "", 3, MKLDNNPlugin::impl_desc_type::ref, {
-                        [](MKLDNNPlugin::PrimitiveDescInfo impl) {
-                            ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType());
-                            ASSERT_EQ(3, impl.getConfig().inConfs.size());
-                            ASSERT_EQ(1, impl.getConfig().outConfs.size());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(1).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(2).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
-                        }
-                } },
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Max, "", 3, MKLDNNPlugin::impl_desc_type::ref, {
-                        [](MKLDNNPlugin::PrimitiveDescInfo impl) {
-                            ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType());
-                            ASSERT_EQ(3, impl.getConfig().inConfs.size());
-                            ASSERT_EQ(1, impl.getConfig().outConfs.size());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(1).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(2).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
-                        }
-                } },
-                eltwise_test_params{{1, 32, 16, 16, 16},{1, 32, 16, 16, 16},{1, 32, 16, 16, 16}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref, {
-                        [](MKLDNNPlugin::PrimitiveDescInfo impl) {
-                            ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType());
-                            ASSERT_EQ(3, impl.getConfig().inConfs.size());
-                            ASSERT_EQ(1, impl.getConfig().outConfs.size());
-                            ASSERT_EQ(InferenceEngine::Layout::NCDHW, impl.getConfig().inConfs.at(0).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCDHW, impl.getConfig().inConfs.at(1).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCDHW, impl.getConfig().inConfs.at(2).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCDHW, impl.getConfig().outConfs.at(0).desc.getLayout());
-                        }
-                } },
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Min, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sub, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Div, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Logical_AND, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Logical_OR, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Logical_XOR, "", 3, MKLDNNPlugin::impl_desc_type::ref}
-        ));
-
-class MKLDNNGraphEltwise2InputsTests: public TestsCommon,
-                                     public WithParamInterface<eltwise_test_params> {
-    std::string model_t = R"V0G0N(
-<net name="EltwiseOnly" version="2" precision="FP32">
-    <layers>
-        <layer name="in1" type="Input" precision="FP32" id="1">
-            <output>
-                <port id="1">__SRC_DIMS_1__
-                </port>
-            </output>
-        </layer>
-        <layer name="in2" type="Input" precision="FP32" id="2">
-            <output>
-                <port id="2">__SRC_DIMS_2__
-                </port>
-            </output>
-        </layer>
-        <layer name="con" id="3" type="Eltwise" precision="FP32">
-            <data operation="_OP_" _COEFF_/>
-            <input>
-                <port id="1">__SRC_DIMS_1__
-                </port>
-                <port id="2">__SRC_DIMS_2__
-                </port>
-            </input>
-            <output>
-                <port id="3">__SRC_DIMS__
-                </port>
-            </output>
-        </layer>
-    </layers>
-    <edges>
-        <edge from-layer="1" from-port="1" to-layer="3" to-port="1"/>
-        <edge from-layer="2" from-port="2" to-layer="3" to-port="2"/>
-    </edges>
-</net>
-)V0G0N";
-
-protected:
-    std::string getModel(eltwise_test_params p) {
-        std::string model = model_t;
-        std::string op = select_op(p.op);
-
-        std::string src_dims1 = "";
-        for (auto &dim : p.dims1) {
-            src_dims1 += "\n                    <dim>";
-            src_dims1 += std::to_string(dim) + "</dim>";
-        }
-        REPLACE_WITH_STR(model, "__SRC_DIMS_1__", src_dims1);
-
-        std::string src_dims2 = "";
-        for (auto &dim : p.dims2) {
-            src_dims2 += "\n                    <dim>";
-            src_dims2 += std::to_string(dim) + "</dim>";
-        }
-        REPLACE_WITH_STR(model, "__SRC_DIMS_2__", src_dims2);
-
-        std::string src_dims;
-        std::vector<size_t> dims = (p.dims1.size() >= p.dims2.size()) ? p.dims1 : p.dims2;
-        int i = dims.size() - 1, j = p.dims1.size() - 1, k = p.dims2.size() - 1;
-        for (; j >= 0 && k >= 0; i--, j--, k-- ) {
-            dims[i] = std::max(p.dims1[j], p.dims2[k]);
-        }
-
-        for (auto &dim : dims) {
-            src_dims += "\n                    <dim>";
-            src_dims += std::to_string(dim) + "</dim>";
-        }
-        REPLACE_WITH_STR(model, "__SRC_DIMS__", src_dims);
-
-        std::string scale;
-        if (!p.scales.empty()) {
-            scale = std::string("coeff=\"") + to_string_c_locale(p.scales) + std::string("\"");
-        }
-        REPLACE_WITH_STR(model, "_OP_", op);
-        REPLACE_WITH_STR(model, "_COEFF_", scale);
-
-        return model;
-    }
-
-    virtual void TearDown() {
-    }
-
-    virtual void SetUp() {
-        try {
-            TestsCommon::SetUp();
-            eltwise_test_params p = ::testing::WithParamInterface<eltwise_test_params>::GetParam();
-            std::string model = getModel(p);
-
-            InferenceEngine::Core core;
-            InferenceEngine::CNNNetwork network;
-            ASSERT_NO_THROW(network = core.ReadNetwork(model, InferenceEngine::Blob::CPtr()));
-
-            MKLDNNGraphTestClass graph;
-            graph.CreateGraph(network);
-
-            auto& nodes = graph.getNodes();
-            for (int i = 0; i < nodes.size(); i++) {
-                if (nodes[i]->getType() == MKLDNNPlugin::Eltwise) {
-                    ASSERT_EQ(p.num_prim_desc, nodes[i]->getSupportedPrimitiveDescriptors().size());
-                    for (size_t j = 0; j < p.num_prim_desc && j < p.comp.size(); j++) {
-                        p.comp.at(j)(nodes[i]->getSupportedPrimitiveDescriptors().at(j));
-                    }
-                    ASSERT_NE(nullptr, nodes[i]->getSelectedPrimitiveDescriptor());
-                    ASSERT_EQ(p.selectedType, nodes[i]->getSelectedPrimitiveDescriptor()->getImplementationType());
-                }
-            }
-            InferenceEngine::SizeVector dims_src1 = p.dims1;
-            InferenceEngine::Blob::Ptr src1 = InferenceEngine::make_shared_blob<float>({InferenceEngine::Precision::FP32, dims_src1, InferenceEngine::TensorDesc::getLayoutByDims(p.dims1) });
-            src1->allocate();
-
-            InferenceEngine::TBlob<float>* srcPtr1 = dynamic_cast<InferenceEngine::TBlob<float>*>(src1.get());
-
-            if (srcPtr1 == nullptr)
-                FAIL() << "Cannot cast blob to TBlob<float>.";
-
-            CommonTestUtils::fill_data_sine(src1->buffer(), src1->size(), 0.1, 0.9, 1);
-
-            InferenceEngine::SizeVector dims_src2 = p.dims2;
-            InferenceEngine::Blob::Ptr src2 = InferenceEngine::make_shared_blob<float>({InferenceEngine::Precision::FP32, dims_src2, InferenceEngine::TensorDesc::getLayoutByDims(p.dims2) });
-            src2->allocate();
-
-            InferenceEngine::TBlob<float>* srcPtr2 = dynamic_cast<InferenceEngine::TBlob<float>*>(src2.get());
-
-            if (srcPtr2 == nullptr)
-                FAIL() << "Cannot cast blob to TBlob<float>.";
-
-            CommonTestUtils::fill_data_sine(src2->buffer(), src2->size(), 0.1, 0.9, 2);
-
-            InferenceEngine::BlobMap srcs;
-            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in1", src1));
-            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in2", src2));
-
-            InferenceEngine::OutputsDataMap out;
-            out = network.getOutputsInfo();
-            InferenceEngine::BlobMap outputBlobs;
-
-            std::pair<std::string, InferenceEngine::DataPtr> item = *out.begin();
-
-            InferenceEngine::TBlob<float>::Ptr output;
-            output = InferenceEngine::make_shared_blob<float>(item.second->getTensorDesc());
-            output->allocate();
-            outputBlobs[item.first] = output;
-
-            graph.Infer(srcs, outputBlobs);
-
-            InferenceEngine::TBlob<float> dst_ref(item.second->getTensorDesc());
-            dst_ref.allocate();
-
-            std::vector<InferenceEngine::TBlob<float>> src_vec = {*srcPtr1, *srcPtr2};
-
-            ref_eltwise(src_vec, dst_ref, p);
-
-            compare(*output, dst_ref, 0.0005f);
-        } catch (const InferenceEngine::details::InferenceEngineException &e) {
-            FAIL() << e.what();
-        }
-    }
-
-};
-
-TEST_P(MKLDNNGraphEltwise2InputsTests, TestsEltwise) {}
-
-INSTANTIATE_TEST_CASE_P(
-        TestsEltwise, MKLDNNGraphEltwise2InputsTests,
-        ::testing::Values(
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Prod, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Max, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Min, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Sub, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Div, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Squared_diff, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Logical_AND, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Logical_OR, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Logical_XOR, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Less, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Less_equal, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Greater, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Greater_equal, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Equal, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Not_equal, "", 3, MKLDNNPlugin::impl_desc_type::ref}
-        ));
-
-INSTANTIATE_TEST_CASE_P(
-        TestsBroadcasting, MKLDNNGraphEltwise2InputsTests,
-        ::testing::Values(
-                eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Prod, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Max, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Min, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Sub, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Div, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Squared_diff, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Logical_AND, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Logical_OR, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Logical_XOR, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                //  batch broadcasting
-                eltwise_test_params{{1, 3, 224},{224, 3, 1},{}, eltwise_test_params::opType::Sum, "", 2, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{2, 3, 1, 2},{1, 3, 2, 1},{}, eltwise_test_params::opType::Sub, "", 1, MKLDNNPlugin::impl_desc_type::ref}
-
-        ));
-
-INSTANTIATE_TEST_CASE_P(
-        TestsDiffDims, MKLDNNGraphEltwise2InputsTests,
-        ::testing::Values(
-                eltwise_test_params{{},{1, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3},{},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3},{3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{},{1, 3, 3},{}, eltwise_test_params::opType::Sum, "", 2, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3},{},{}, eltwise_test_params::opType::Sum, "", 2, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3},{3},{}, eltwise_test_params::opType::Sum, "", 2, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3},{1, 3, 3},{}, eltwise_test_params::opType::Sum, "", 2, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3},{1, 3},{}, eltwise_test_params::opType::Sum, "", 2, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{},{1, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{},{1, 3, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3, 3},{},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3},{1, 3, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3, 3},{1, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3},{1, 3, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3, 3},{1, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref}
-        ));
-
-class MKLDNNGraphEltwiseDynBatchTests: public MKLDNNGraphEltwise3InputsTests {
-protected:
-    virtual void SetUp() {
-        try {
-            TestsCommon::SetUp();
-            eltwise_test_params p = ::testing::WithParamInterface<eltwise_test_params>::GetParam();
-            std::string model = getModel(p);
-            size_t MB = p.dims1[0];
-            if (MB < 2)
-                MB = 2;
-
-            InferenceEngine::Core core;
-            InferenceEngine::CNNNetwork network;
-            ASSERT_NO_THROW(network = core.ReadNetwork(model, InferenceEngine::Blob::CPtr()));
-
-            auto implNet = dynamic_cast<InferenceEngine::details::CNNNetworkImpl *>(&((InferenceEngine::ICNNNetwork&)network));
-            ASSERT_NE(nullptr, implNet) << "Failed to cast ICNNNetwork to CNNNetworkImpl";
-            InferenceEngine::ResponseDesc resp;
-            InferenceEngine::StatusCode sts  = implNet->setBatchSizeReshape(MB, &resp);
-            ASSERT_EQ((int)InferenceEngine::StatusCode::OK, sts) << resp.msg;
-
-            MKLDNNGraphTestClass graph;
-            graph.setProperty({{InferenceEngine::PluginConfigParams::KEY_DYN_BATCH_ENABLED, InferenceEngine::PluginConfigParams::YES}});
-            graph.CreateGraph(network);
-
-            InferenceEngine::SizeVector dims_src1 = p.dims1;
-            InferenceEngine::Layout layout1 = InferenceEngine::ANY;
-            switch (p.dims1.size()) {
-                case 4:
-                    layout1 = InferenceEngine::NCHW;
-                    break;
-                case 5:
-                    layout1 = InferenceEngine::NCDHW;
-                    break;
-            }
-            InferenceEngine::SizeVector dims_src2 = p.dims2;
-            InferenceEngine::Layout layout2 = InferenceEngine::ANY;
-            switch (p.dims2.size()) {
-                case 4:
-                    layout2 = InferenceEngine::NCHW;
-                    break;
-                case 5:
-                    layout2 = InferenceEngine::NCDHW;
-                    break;
-            }
-            InferenceEngine::SizeVector dims_src3 = p.dims3;
-            InferenceEngine::Layout layout3 = InferenceEngine::ANY;
-            switch (p.dims3.size()) {
-                case 4:
-                    layout3 = InferenceEngine::NCHW;
-                    break;
-                case 5:
-                    layout3 = InferenceEngine::NCDHW;
-                    break;
-            }
-
-            InferenceEngine::Blob::Ptr src1 = InferenceEngine::make_shared_blob<float>({InferenceEngine::Precision::FP32, dims_src1, layout1});
-            src1->allocate();
-
-            InferenceEngine::TBlob<float>* srcPtr1 = dynamic_cast<InferenceEngine::TBlob<float>*>(src1.get());
-
-            if (srcPtr1 == nullptr)
-                FAIL() << "Cannot cast blob to TBlob<float>.";
-
-            fill_data(src1->buffer(), src1->size());
-            InferenceEngine::Blob::Ptr src2 = InferenceEngine::make_shared_blob<float>({InferenceEngine::Precision::FP32, dims_src2, layout2});
-            src2->allocate();
-
-            InferenceEngine::TBlob<float>* srcPtr2 = dynamic_cast<InferenceEngine::TBlob<float>*>(src2.get());
-
-            if (srcPtr2 == nullptr)
-                FAIL() << "Cannot cast blob to TBlob<float>.";
-            fill_data(src2->buffer(), src2->size());
-            InferenceEngine::Blob::Ptr src3 = InferenceEngine::make_shared_blob<float>({InferenceEngine::Precision::FP32, dims_src3, layout3});
-            src3->allocate();
-
-            InferenceEngine::TBlob<float>* srcPtr3 = dynamic_cast<InferenceEngine::TBlob<float>*>(src3.get());
-
-            if (srcPtr3 == nullptr)
-                FAIL() << "Cannot cast blob to TBlob<float>.";
-            fill_data(src3->buffer(), src3->size());
-            InferenceEngine::BlobMap srcs;
-            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in1", src1));
-            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in2", src2));
-            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in3", src3));
-
-            InferenceEngine::OutputsDataMap out;
-            out = network.getOutputsInfo();
-            InferenceEngine::BlobMap outputBlobs;
-
-            std::pair<std::string, InferenceEngine::DataPtr> item = *out.begin();
-
-            InferenceEngine::TBlob<float>::Ptr output;
-            output = InferenceEngine::make_shared_blob<float>(item.second->getTensorDesc());
-            output->allocate();
-            outputBlobs[item.first] = output;
-
-
-            auto checkDepthwise = [](const MKLDNNPlugin::MKLDNNNodePtr& node) {
-                return node->getType() == MKLDNNPlugin::Eltwise;
-            };
-
-            graph.checkDynBatch(srcs, outputBlobs, MB, MB, checkDepthwise);
-            graph.checkDynBatch(srcs, outputBlobs, 1, MB, checkDepthwise);
-        } catch (const InferenceEngine::details::InferenceEngineException &e) {
-            FAIL() << e.what();
-        }
-    }
-};
-
-TEST_P(MKLDNNGraphEltwiseDynBatchTests, TestsDynBatchEltwise) {}
-
-// TODO: rewrite to ngraph to have reshape functionality
-INSTANTIATE_TEST_CASE_P(
-        DISABLED_TestsDynBatchEltwise, MKLDNNGraphEltwiseDynBatchTests,
-        ::testing::Values(
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sum, "1.0,1.0,1.0", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sum, "1.5,0.5,-2.0", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Prod, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Max, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sub, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Min, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Div, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Pow, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Logical_AND, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Logical_OR, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Logical_XOR, "", 3, MKLDNNPlugin::impl_desc_type::ref}
-                ));
-
 struct precisions_test_2params {
     struct {
         std::string precision0;
@@ -1022,7 +373,7 @@ INSTANTIATE_TEST_CASE_P(
         TestsEltwise2Precisions, MKLDNNGraphEltwise2PrecisionsTests,
         ::testing::Values(
             precisions_test_2params{ {"FP32", "FP32"}, 4, 0 },
-            precisions_test_2params{ {  "U8", "FP32"}, 5, 1 },
-            precisions_test_2params{ {"FP32",   "U8"}, 5, 1 },
-            precisions_test_2params{ {  "U8",   "U8"}, 6, 2 }
+            precisions_test_2params{ {  "U8", "FP32"}, 4, 0 },
+            precisions_test_2params{ {"FP32",   "U8"}, 4, 0 },
+            precisions_test_2params{ {  "U8",   "U8"}, 4, 0 }
         ));
index cf0650b..84b5a08 100644 (file)
@@ -116,13 +116,12 @@ protected:
             graph.CreateGraph(network);
             auto& nodes = graph.getNodes();
             for (int i = 0; i < nodes.size(); i++) {
-                if (nodes[i]->getType() == MKLDNNPlugin::Power) {
+                if (nodes[i]->getType() == MKLDNNPlugin::Eltwise) {
                     ASSERT_EQ(p.num_prim_desc, nodes[i]->getSupportedPrimitiveDescriptors().size());
                     for (size_t j = 0; j < p.num_prim_desc && j < p.comp.size(); j++) {
                         p.comp.at(j)(nodes[i]->getSupportedPrimitiveDescriptors().at(j));
                     }
                     ASSERT_NE(nullptr, nodes[i]->getSelectedPrimitiveDescriptor());
-                    ASSERT_EQ(p.selectedType, nodes[i]->getSelectedPrimitiveDescriptor()->getImplementationType());
                 }
             }
 
@@ -174,25 +173,16 @@ INSTANTIATE_TEST_CASE_P(
                 power_test_params{
                         {1, 3, 13, 13}, 1, 2, 0.5f, 3, MKLDNNPlugin::impl_desc_type::unknown, {
                                 [](MKLDNNPlugin::PrimitiveDescInfo impl) {
-                                    ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
                                     ASSERT_EQ(1, impl.getConfig().inConfs.size());
                                     ASSERT_EQ(1, impl.getConfig().outConfs.size());
-                                    ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
-                                    ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
                                 },
                                 [](MKLDNNPlugin::PrimitiveDescInfo impl) {
-                                    ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
                                     ASSERT_EQ(1, impl.getConfig().inConfs.size());
                                     ASSERT_EQ(1, impl.getConfig().outConfs.size());
-                                    ASSERT_EQ(InferenceEngine::Layout::BLOCKED, impl.getConfig().inConfs.at(0).desc.getLayout());
-                                    ASSERT_EQ(InferenceEngine::Layout::BLOCKED, impl.getConfig().outConfs.at(0).desc.getLayout());
                                 },
                                 [](MKLDNNPlugin::PrimitiveDescInfo impl) {
-                                    ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
                                     ASSERT_EQ(1, impl.getConfig().inConfs.size());
                                     ASSERT_EQ(1, impl.getConfig().outConfs.size());
-                                    ASSERT_EQ(InferenceEngine::Layout::BLOCKED, impl.getConfig().inConfs.at(0).desc.getLayout());
-                                    ASSERT_EQ(InferenceEngine::Layout::BLOCKED, impl.getConfig().outConfs.at(0).desc.getLayout());
                                 }}},
                 power_test_params{{1, 1, 23, 23}, 3, 8, 2, 3 },
                 power_test_params{{1, 8, 23, 23}, 8, 2, 1, 3 },
@@ -306,7 +296,7 @@ protected:
             outputBlobs[item.first] = output;
 
             auto checkPower = [](const MKLDNNPlugin::MKLDNNNodePtr& node) {
-                return node->getType() == MKLDNNPlugin::Power;
+                return node->getType() == MKLDNNPlugin::Eltwise;
             };
             graph.checkDynBatch(srcs, outputBlobs, MB, MB, checkPower);
             graph.checkDynBatch(srcs, outputBlobs, 1, MB, checkPower);
@@ -325,25 +315,16 @@ INSTANTIATE_TEST_CASE_P(
                 power_test_params{
                         {1, 3, 13, 13}, 1, 2, 0.5f, 3, MKLDNNPlugin::impl_desc_type::unknown, {
                                 [](MKLDNNPlugin::PrimitiveDescInfo impl) {
-                                    ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
                                     ASSERT_EQ(1, impl.getConfig().inConfs.size());
                                     ASSERT_EQ(1, impl.getConfig().outConfs.size());
-                                    ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
-                                    ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
                                 },
                                 [](MKLDNNPlugin::PrimitiveDescInfo impl) {
-                                    ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
                                     ASSERT_EQ(1, impl.getConfig().inConfs.size());
                                     ASSERT_EQ(1, impl.getConfig().outConfs.size());
-                                    ASSERT_EQ(InferenceEngine::Layout::BLOCKED, impl.getConfig().inConfs.at(0).desc.getLayout());
-                                    ASSERT_EQ(InferenceEngine::Layout::BLOCKED, impl.getConfig().outConfs.at(0).desc.getLayout());
                                 },
                                 [](MKLDNNPlugin::PrimitiveDescInfo impl) {
-                                    ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
                                     ASSERT_EQ(1, impl.getConfig().inConfs.size());
                                     ASSERT_EQ(1, impl.getConfig().outConfs.size());
-                                    ASSERT_EQ(InferenceEngine::Layout::BLOCKED, impl.getConfig().inConfs.at(0).desc.getLayout());
-                                    ASSERT_EQ(InferenceEngine::Layout::BLOCKED, impl.getConfig().outConfs.at(0).desc.getLayout());
                                 }}},
                 power_test_params{{1, 1, 23, 23}, 3, 8, 2, 3 },
                 power_test_params{{1, 8, 23, 23}, 8, 2, 1, 3 },
index 3752e32..4f46fa5 100644 (file)
@@ -257,14 +257,14 @@ protected:
                 ASSERT_EQ(nodes.size(), 3);
                 ASSERT_EQ(nodes[0].get()->getType(), MKLDNNPlugin::Type::Input);
                 ASSERT_EQ(nodes[1].get()->getType(), MKLDNNPlugin::Type::Convolution);
-                ASSERT_TRUE(nodes[1].get()->isFusedWith(MKLDNNPlugin::Type::Depthwise));
+                ASSERT_TRUE(nodes[1].get()->isFusedWith(MKLDNNPlugin::Type::Eltwise));
                 ASSERT_EQ(nodes[2].get()->getType(), MKLDNNPlugin::Type::Output);
             } else {
                 ASSERT_EQ(nodes.size(), 5);
                 ASSERT_EQ(nodes[0].get()->getType(), MKLDNNPlugin::Type::Input);
                 ASSERT_EQ(nodes[1].get()->getType(), MKLDNNPlugin::Type::Reorder);
                 ASSERT_EQ(nodes[2].get()->getType(), MKLDNNPlugin::Type::Convolution);
-                ASSERT_TRUE(nodes[2].get()->isFusedWith(MKLDNNPlugin::Type::Depthwise));
+                ASSERT_TRUE(nodes[2].get()->isFusedWith(MKLDNNPlugin::Type::Eltwise));
                 ASSERT_EQ(nodes[3].get()->getType(), MKLDNNPlugin::Type::Reorder);
                 ASSERT_EQ(nodes[4].get()->getType(), MKLDNNPlugin::Type::Output);
             }
index 6e83b9b..301048d 100644 (file)
@@ -186,10 +186,9 @@ TEST_F(MKLDNNGraphStructureTests, TestNoRedundantReorders) {
     for (auto &node : nodes) {
         if (node->getType() == MKLDNNPlugin::Reorder) {
             reorders_num++;
-            ASSERT_EQ(MKLDNNPlugin::Output, node->getChildEdgeAt(0)->getChild()->getType());
         }
     }
-    ASSERT_EQ(reorders_num, 1);
+    ASSERT_EQ(reorders_num, 3);
 }
 
 TEST_F(MKLDNNGraphStructureTests, TestRedundantReorderBeforeConvWithC_3) {
@@ -3781,7 +3780,7 @@ TEST_F(MKLDNNGraphStructureTests, TestNoRedundantReordersForXceptionTopology) {
     weights->allocate();
     fill_data((float *) weights->buffer(), weights->size() / sizeof(float));
     InferenceEngine::TBlob<uint8_t>::Ptr weights_ptr = InferenceEngine::TBlob<uint8_t>::Ptr(weights);
-    
+
     InferenceEngine::Core core;
     InferenceEngine::CNNNetwork network;
     ASSERT_NO_THROW(network = core.ReadNetwork(model, weights_ptr));
@@ -4020,7 +4019,7 @@ TEST_F(MKLDNNGraphStructureTests, TestFailedPartPlateRecognitionBarrier0001) {
     fill_data((float *) weights->buffer(), weights->size() / sizeof(float));
 
     InferenceEngine::TBlob<uint8_t>::Ptr weights_ptr = InferenceEngine::TBlob<uint8_t>::Ptr(weights);
-    
+
     InferenceEngine::Core core;
     InferenceEngine::CNNNetwork network;
     ASSERT_NO_THROW(network = core.ReadNetwork(model, weights_ptr));
@@ -4629,7 +4628,7 @@ TEST_F(MKLDNNGraphStructureTests, TestConvolutionDWConvolutionSumFusing) {
     memset((float *) weights->buffer(), 0, weights->size());
 
     InferenceEngine::TBlob<uint8_t>::Ptr weights_ptr = InferenceEngine::TBlob<uint8_t>::Ptr(weights);
-    
+
     InferenceEngine::Core core;
     InferenceEngine::CNNNetwork network;
     network = core.ReadNetwork(model, weights_ptr);
@@ -5127,7 +5126,7 @@ TEST_F(MKLDNNGraphStructureTests, TestGemmConvolutionWithConcat) {
     weights->allocate();
     fill_data((float *) weights->buffer(), weights->size() / sizeof(float));
     InferenceEngine::TBlob<uint8_t>::Ptr weights_ptr = InferenceEngine::TBlob<uint8_t>::Ptr(weights);
-    
+
     InferenceEngine::Core core;
     InferenceEngine::CNNNetwork network;
     ASSERT_NO_THROW(network = core.ReadNetwork(model, weights_ptr));
@@ -5412,7 +5411,7 @@ TEST_F(MKLDNNGraphStructureTests, TestRefPoolingWithConcat) {
     weights->allocate();
     fill_data((float *) weights->buffer(), weights->size() / sizeof(float));
     InferenceEngine::TBlob<uint8_t>::Ptr weights_ptr = InferenceEngine::TBlob<uint8_t>::Ptr(weights);
-    
+
     InferenceEngine::Core core;
     InferenceEngine::CNNNetwork network;
     ASSERT_NO_THROW(network = core.ReadNetwork(model, weights_ptr));
@@ -5566,7 +5565,7 @@ TEST_F(MKLDNNGraphStructureTests, TestConvolutionWith2DepthwiseOpFusing) {
     ASSERT_EQ(nodes[0].get()->getType(), MKLDNNPlugin::Type::Input);
     ASSERT_EQ(nodes[1].get()->getType(), MKLDNNPlugin::Type::Reorder);
     ASSERT_EQ(nodes[2].get()->getType(), MKLDNNPlugin::Type::Convolution);
-    ASSERT_TRUE(nodes[2].get()->isFusedWith(MKLDNNPlugin::Type::Depthwise));
+    ASSERT_TRUE(nodes[2].get()->isFusedWith(MKLDNNPlugin::Type::Eltwise));
     ASSERT_EQ(nodes[3].get()->getType(), MKLDNNPlugin::Type::Reorder);
     ASSERT_EQ(nodes[4].get()->getType(), MKLDNNPlugin::Type::Output);
 
@@ -5704,7 +5703,7 @@ TEST_F(MKLDNNGraphStructureTests, TestConvolutionWith2EltwiseOpFusing) {
     ASSERT_EQ(nodes.size(), 4);
     ASSERT_EQ(nodes[0].get()->getType(), MKLDNNPlugin::Type::Input);
     ASSERT_EQ(nodes[1].get()->getType(), MKLDNNPlugin::Type::Convolution);
-    ASSERT_TRUE(nodes[1].get()->isFusedWith(MKLDNNPlugin::Type::Activation));
+    ASSERT_TRUE(nodes[1].get()->isFusedWith(MKLDNNPlugin::Type::Eltwise));
     ASSERT_EQ(nodes[2].get()->getType(), MKLDNNPlugin::Type::Reorder);
     ASSERT_EQ(nodes[3].get()->getType(), MKLDNNPlugin::Type::Output);
 
@@ -5846,7 +5845,7 @@ TEST_F(MKLDNNGraphStructureTests, TestGemmConvolutionWith2DepthwiseOpFusing) {
     ASSERT_EQ(nodes.size(), 3);
     ASSERT_EQ(nodes[0].get()->getType(), MKLDNNPlugin::Type::Input);
     ASSERT_EQ(nodes[1].get()->getType(), MKLDNNPlugin::Type::Convolution);
-    ASSERT_TRUE(nodes[1].get()->isFusedWith(MKLDNNPlugin::Type::Depthwise));
+    ASSERT_TRUE(nodes[1].get()->isFusedWith(MKLDNNPlugin::Type::Eltwise));
     ASSERT_EQ(nodes[2].get()->getType(), MKLDNNPlugin::Type::Output);
 
     InferenceEngine::TensorDesc src_desc(InferenceEngine::Precision::FP32, {1, 8, 300, 600}, InferenceEngine::NCHW);
index 27ed5b3..7d381f4 100644 (file)
@@ -27,6 +27,7 @@
 #include <nodes/mkldnn_input_node.h>
 #include <functional>
 #include <cmath>
+#include <legacy/details/ie_cnn_network_tools.h>
 
 #define GARB_VAL(x) ((x + 100.0f + sin(x)) / (x + 150.f))
 
@@ -212,13 +213,66 @@ public:
         return graphNodes;
     }
 
+    void MoveInternalBlobsToConstLayers(InferenceEngine::details::CNNNetworkImpl* netImpl) {  // Test helper: re-expresses ScaleShift/PReLU internal blobs as extra "Const" input layers of netImpl.
+        auto createConstInputTo = [&](InferenceEngine::CNNLayerPtr layer, InferenceEngine::Blob::Ptr blob, std::string name) {  // Adds a "Const" layer holding `blob` and wires it in as one more input of `layer`.
+            InferenceEngine::LayerParams attrs = {layer.get()->name + "_const_" + name, "Const", InferenceEngine::Precision::FP32};  // layer name is "<layer>_const_<name>", precision is always FP32
+            auto constLayer = std::make_shared<InferenceEngine::CNNLayer>(attrs);
+            constLayer->blobs["custom"] = blob;  // the data is stored under the "custom" blob key
+
+            std::vector<size_t> constDims(layer->insData[0].lock()->getDims().size(), 1);  // same rank as the layer's first input, every dim initialised to 1
+            if (constDims.size() > 1)
+                constDims[1] = blob.get()->size();  // rank > 1: the blob's element count goes into dim 1
+            else
+                constDims[0] = blob.get()->size();  // rank 1: the blob's element count goes into dim 0
+            const InferenceEngine::TensorDesc& td = {InferenceEngine::Precision::FP32, constDims, InferenceEngine::TensorDesc::getLayoutByDims(constDims)};  // layout is derived from the dims
+
+            InferenceEngine::DataPtr newEdgeAfterLayer(new InferenceEngine::Data(constLayer->name, td));  // output Data of the new Const layer, named after it
+            newEdgeAfterLayer->setName(constLayer->name);
+            getCreatorLayer(newEdgeAfterLayer) = constLayer;  // the Const layer is the producer of this Data
+            getInputTo(newEdgeAfterLayer).clear();  // start with no consumers; the target layer is added below
+
+
+            netImpl->addData(constLayer->name.c_str(), newEdgeAfterLayer);  // register the new Data in the network
+            IE_SUPPRESS_DEPRECATED_START
+            netImpl->addLayer(constLayer);  // register the new layer in the network
+            IE_SUPPRESS_DEPRECATED_END
+
+            constLayer->outData.push_back(newEdgeAfterLayer);
+            getInputTo(newEdgeAfterLayer)[layer->name] = layer;  // `layer` becomes the consumer of the Const output
+            layer->insData.push_back(newEdgeAfterLayer);  // appended after the layer's existing inputs
+        };
+
+        auto all_layers = InferenceEngine::details::CNNNetSortTopologically(*netImpl);  // snapshot of the layers taken before any Const layer is added
+        for (auto &layer : all_layers) {
+            if (layer->type == "ScaleShift" && layer->insData.size() == 1) {  // only ScaleShift layers that have a single input at this point
+                InferenceEngine::Blob::Ptr scalesBlob = layer->blobs["weights"];
+                if (scalesBlob != nullptr)
+                    createConstInputTo(layer, scalesBlob, "weights");  // "weights" is attached first
+
+                InferenceEngine::Blob::Ptr shiftBlob = layer->blobs["biases"];
+                if (shiftBlob != nullptr)
+                    createConstInputTo(layer, shiftBlob, "biases");  // "biases" is attached after "weights" (when both exist)
+            } else if (layer->type == "PReLU" && layer->insData.size() == 1) {  // PReLU: only the "weights" blob is moved
+                InferenceEngine::Blob::Ptr scalesBlob = layer->blobs["weights"];
+                if (scalesBlob != nullptr)
+                    createConstInputTo(layer, scalesBlob, "weights");
+            }
+        }
+    }
+
     void CreateGraph(InferenceEngine::ICNNNetwork &network, const MKLDNNPlugin::MKLDNNExtensionManager::Ptr& extMgr,
             MKLDNNPlugin::MKLDNNWeightsSharing::Ptr cache = {}) {
         if (network.getFunction()) {
             auto convertedNetwork = std::make_shared<InferenceEngine::details::CNNNetworkImpl>(network);
+            MoveInternalBlobsToConstLayers(convertedNetwork.get());  // rewrite blobs on the converted copy before the graph is built from it
             MKLDNNGraph::CreateGraph(static_cast<InferenceEngine::ICNNNetwork&>(*convertedNetwork),
-                extMgr, cache);            
+                extMgr, cache);
         } else {
+            InferenceEngine::details::CNNNetworkImpl* netImpl = dynamic_cast<InferenceEngine::details::CNNNetworkImpl*>(&network);  // getFunction() was empty: the network itself must be a CNNNetworkImpl
+            if (netImpl == nullptr) {  // dynamic_cast failed, i.e. `network` is some other ICNNNetwork implementation
+                THROW_IE_EXCEPTION << "unexpected network type";
+            }
+            MoveInternalBlobsToConstLayers(netImpl);  // note: this modifies the caller's network in place
             MKLDNNGraph::CreateGraph(network, extMgr, cache);
         }
     }
@@ -227,9 +281,15 @@ public:
         MKLDNNPlugin::MKLDNNWeightsSharing::Ptr cache;
         if (network.getFunction()) {
             auto convertedNetwork = std::make_shared<InferenceEngine::details::CNNNetworkImpl>(network);
+            MoveInternalBlobsToConstLayers(convertedNetwork.get());
             MKLDNNGraph::CreateGraph(static_cast<InferenceEngine::ICNNNetwork&>(*convertedNetwork),
                 extensionManager, cache);            
         } else {
+            InferenceEngine::details::CNNNetworkImpl* netImpl = dynamic_cast<InferenceEngine::details::CNNNetworkImpl*>(&network);
+            if (netImpl == nullptr) {
+                THROW_IE_EXCEPTION << "unexpected network type";
+            }
+            MoveInternalBlobsToConstLayers(netImpl);
             MKLDNNGraph::CreateGraph(network, extensionManager, cache);
         }
     }
index 4b23902..d7d8ed4 160000 (submodule)
@@ -1 +1 @@
-Subproject commit 4b239023043318899e1c0a3b79158a68b7efe6e4
+Subproject commit d7d8ed46078b637794bc91215e1a982bb0f1683a
index 34e3ecb..56ac242 100644 (file)
@@ -115,11 +115,6 @@ xfail_issue_38084 = xfail_test(reason="RuntimeError: AssertionFailed: layer->get
 xfail_issue_38085 = xfail_test(reason="RuntimeError: Interpolate operation should be converted to Interp")
 xfail_issue_38086 = xfail_test(reason="RuntimeError: Quantize layer input '<value>' doesn't have blobs")
 xfail_issue_38087 = xfail_test(reason="RuntimeError: Cannot cast to tensor desc. Format is unsupported!")
-xfail_issue_38088 = xfail_test(reason="RuntimeError: Check '((axis >= axis_range_min) && "
-                                      "(axis <= axis_range_max))' failed at "
-                                      "/openvino/ngraph/core/src/validation_util.cpp:913: "
-                                      "Split Parameter axis <value> out of the tensor rank range <value>.")
-xfail_issue_38089 = xfail_test(reason="RuntimeError: Node 2 contains empty child edge for index 0")
 xfail_issue_38090 = xfail_test(reason="AssertionError: Items types are not equal")
 xfail_issue_38091 = xfail_test(reason="AssertionError: Mismatched elements")
 xfail_issue_38699 = xfail_test(reason="RuntimeError: nGraph does not support the following ONNX operations:"
index e82c678..3fb616c 100644 (file)
@@ -22,7 +22,6 @@ from tests import (xfail_issue_34323,
                    skip_segfault,
                    xfail_issue_34327,
                    xfail_issue_36485,
-                   xfail_issue_35923,
                    xfail_issue_36486,
                    xfail_issue_34314,
                    xfail_issue_36487)
@@ -418,7 +417,6 @@ def test_grn_operator():
     assert np.allclose(result, expected)
 
 
-@xfail_issue_35923
 def test_prelu_operator():
     runtime = get_runtime()
 
index 6da6151..a72bca3 100644 (file)
@@ -38,7 +38,6 @@ from tests import (BACKEND_NAME,
                    xfail_issue_33616,
                    xfail_issue_38086,
                    xfail_issue_38087,
-                   xfail_issue_35923,
                    xfail_issue_36483,
                    xfail_issue_34323,
                    xfail_issue_35915,
@@ -46,8 +45,6 @@ from tests import (BACKEND_NAME,
                    xfail_issue_36476,
                    xfail_issue_36478,
                    xfail_issue_36437,
-                   xfail_issue_38088,
-                   xfail_issue_38089,
                    xfail_issue_38090,
                    xfail_issue_38091,
                    xfail_issue_35929,
@@ -220,9 +217,6 @@ tests_expected_to_fail = [
         "OnnxBackendNodeModelTest.test_quantizelinear_cpu"),
     (xfail_issue_38087,
         "OnnxBackendNodeModelTest.test_convtranspose_1d_cpu"),
-    (xfail_issue_35923,
-        "OnnxBackendNodeModelTest.test_prelu_broadcast_cpu",
-        "OnnxBackendNodeModelTest.test_prelu_example_cpu"),
     (xfail_issue_36483,
         "OnnxBackendNodeModelTest.test_ceil_cpu",
         "OnnxBackendNodeModelTest.test_ceil_example_cpu"),
@@ -286,10 +280,6 @@ tests_expected_to_fail = [
         "OnnxBackendNodeModelTest.test_argmin_keepdims_example_select_last_index_cpu",
         "OnnxBackendNodeModelTest.test_argmin_keepdims_random_select_last_index_cpu",
         "OnnxBackendNodeModelTest.test_pow_types_float32_uint32_cpu"),
-    (xfail_issue_38088,
-        "OnnxBackendPyTorchConvertedModelTest.test_GLU_cpu"),
-    (xfail_issue_38089,
-        "OnnxBackendPyTorchConvertedModelTest.test_GLU_dim_cpu"),
     (xfail_issue_38090,
         "OnnxBackendNodeModelTest.test_where_long_example_cpu",
         "OnnxBackendNodeModelTest.test_mod_int64_fmod_cpu",
index 246b52f..bbd6857 100644 (file)
@@ -18,7 +18,6 @@ import onnx
 import pytest
 
 from tests.test_onnx.utils import run_node
-from tests import xfail_issue_35915
 
 
 @pytest.mark.parametrize(
@@ -27,9 +26,9 @@ from tests import xfail_issue_35915
         pytest.param("And", np.logical_and, np.bool),
         pytest.param("Or", np.logical_or, np.bool),
         pytest.param("Xor", np.logical_xor, np.bool),
-        pytest.param("Equal", np.equal, np.int32, marks=xfail_issue_35915),
-        pytest.param("Greater", np.greater, np.int32, marks=xfail_issue_35915),
-        pytest.param("Less", np.less, np.int32, marks=xfail_issue_35915),
+        pytest.param("Equal", np.equal, np.int32),
+        pytest.param("Greater", np.greater, np.int32),
+        pytest.param("Less", np.less, np.int32),
     ],
 )
 def test_logical(onnx_op, numpy_func, data_type):
index 7bb55e0..d1c8a2a 100644 (file)
@@ -18,7 +18,7 @@ import onnx
 import pytest
 
 from tests.test_onnx.utils import run_node
-from tests import xfail_issue_35918, xfail_issue_35923, xfail_issue_35924
+from tests import xfail_issue_35918, xfail_issue_35924
 
 
 def import_and_compute(op_type, input_data, **node_attrs):
@@ -71,7 +71,6 @@ def test_leaky_relu():
     assert_onnx_import_equals_callable("LeakyRelu", leaky_relu, [[-3, -2, -1], [1, 2, 3]])
 
 
-@xfail_issue_35923
 @pytest.mark.parametrize(
     "x, slope",
     [