From abb8817cf6952ca60ece886a1affded3bb876db4 Mon Sep 17 00:00:00 2001
From: Gorokhov Dmitriy
Date: Wed, 28 Oct 2020 09:16:28 +0300
Subject: [PATCH] [CPU] Generic JIT Eltwise implementation (#1464)

---
 inference-engine/src/mkldnn_plugin/CMakeLists.txt | 6 +-
 .../src/mkldnn_plugin/mkldnn_descriptor.cpp | 13 -
 .../src/mkldnn_plugin/mkldnn_descriptor.h | 3 -
 .../src/mkldnn_plugin/mkldnn_exec_network.cpp | 74 +-
 .../src/mkldnn_plugin/mkldnn_graph_optimizer.cpp | 784 ++---
 .../src/mkldnn_plugin/mkldnn_graph_optimizer.h | 10 +-
 inference-engine/src/mkldnn_plugin/mkldnn_node.cpp | 40 +-
 inference-engine/src/mkldnn_plugin/mkldnn_node.h | 3 -
 .../src/mkldnn_plugin/mkldnn_plugin.cpp | 2 +
 .../src/mkldnn_plugin/nodes/common/emitter.cpp | 200 ++
 .../src/mkldnn_plugin/nodes/common/emitter.h | 128 +
 .../mkldnn_plugin/nodes/jit_eltwise_emitters.cpp | 1417 ++++++++
 .../mkldnn_plugin/nodes/jit_eltwise_emitters.hpp | 417 +++
 .../mkldnn_plugin/nodes/jit_mkldnn_emitters.cpp | 70 +
 .../mkldnn_plugin/nodes/jit_mkldnn_emitters.hpp | 32 +
 .../mkldnn_plugin/nodes/mkldnn_activation_node.cpp | 252 --
 .../mkldnn_plugin/nodes/mkldnn_activation_node.h | 44 -
 .../mkldnn_plugin/nodes/mkldnn_batchnorm_node.cpp | 1 -
 .../mkldnn_plugin/nodes/mkldnn_batchnorm_node.h | 2 +-
 .../mkldnn_plugin/nodes/mkldnn_bin_conv_node.cpp | 97 +-
 .../src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp | 8 +-
 .../src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp | 115 +-
 .../mkldnn_plugin/nodes/mkldnn_def_conv_node.cpp | 2 -
 .../mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp | 353 --
 .../mkldnn_plugin/nodes/mkldnn_depthwise_node.h | 46 -
 .../mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp | 3637 +++++++-------------
 .../src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h | 189 +-
 .../nodes/mkldnn_fullyconnected_node.cpp | 28 +-
 .../nodes/mkldnn_interpolate_node.cpp | 86 +-
 .../src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp | 64 +-
 .../mkldnn_plugin/nodes/mkldnn_normalize_node.cpp | 70 +-
 .../src/mkldnn_plugin/nodes/mkldnn_power_node.cpp | 133 -
 .../src/mkldnn_plugin/nodes/mkldnn_power_node.h | 31 -
 .../mkldnn_plugin/nodes/mkldnn_quantize_node.cpp | 8 +-
 .../src/mkldnn_plugin/nodes/mkldnn_reduce_node.cpp | 3 +-
 .../mkldnn_plugin/nodes/mkldnn_resample_node.cpp | 64 +-
 .../nodes/mkldnn_scatter_update_node.cpp | 2 -
 .../single_layer_tests/eltwise.cpp | 7 +-
 .../shared_tests_instances/skip_tests_config.cpp | 1 -
 .../plugin/cpu/single_layer_tests/eltwise.cpp | 327 ++
 .../cpu/subgraph_tests/src/eltwise_chain.cpp | 184 +
 .../shared/src/single_layer_tests/eltwise.cpp | 21 +-
 .../ngraph_functions/src/utils/ngraph_helpers.cpp | 3 +
 .../graph/layers/internal/graph_eltwise_test.cpp | 655 +---
 .../graph/layers/internal/graph_power_test.cpp | 23 +-
 .../structure/graph_conv_depthwise_fusing_test.cpp | 4 +-
 .../graph/structure/graph_structure_test.cpp | 19 +-
 .../unit/engines/mkldnn/graph/test_graph.hpp | 62 +-
 inference-engine/thirdparty/mkl-dnn | 2 +-
 ngraph/python/tests/__init__.py | 5 -
 ngraph/python/tests/test_ngraph/test_ops_fused.py | 2 -
 ngraph/python/tests/test_onnx/test_backend.py | 10 -
 ngraph/python/tests/test_onnx/test_ops_logical.py | 7 +-
 .../python/tests/test_onnx/test_ops_nonlinear.py | 3 +-
 54 files changed, 4764 insertions(+), 5005 deletions(-)
 create mode 100644 inference-engine/src/mkldnn_plugin/nodes/common/emitter.cpp
 create mode 100644 inference-engine/src/mkldnn_plugin/nodes/common/emitter.h
 create mode 100644 inference-engine/src/mkldnn_plugin/nodes/jit_eltwise_emitters.cpp
 create mode 100644 inference-engine/src/mkldnn_plugin/nodes/jit_eltwise_emitters.hpp
 create mode 100644 inference-engine/src/mkldnn_plugin/nodes/jit_mkldnn_emitters.cpp
 create mode 100644 inference-engine/src/mkldnn_plugin/nodes/jit_mkldnn_emitters.hpp
 delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.cpp
 delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.h
 delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp
 delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.h
 delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.cpp
 delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.h
 create mode 100644 inference-engine/tests/functional/plugin/cpu/single_layer_tests/eltwise.cpp
 create mode 100644 inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/eltwise_chain.cpp

diff --git a/inference-engine/src/mkldnn_plugin/CMakeLists.txt b/inference-engine/src/mkldnn_plugin/CMakeLists.txt
index 2ed81eb..6557976 100644
--- a/inference-engine/src/mkldnn_plugin/CMakeLists.txt
+++ b/inference-engine/src/mkldnn_plugin/CMakeLists.txt
@@ -9,7 +9,6 @@ if (WIN32)
 endif()
 
 set(LAYERS
-    ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_activation_node.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_batchnorm_node.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_bin_conv_node.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_concat_node.cpp
@@ -17,7 +16,6 @@ set(LAYERS
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_crop_node.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_deconv_node.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_def_conv_node.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_depthwise_node.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_eltwise_node.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_fullyconnected_node.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_gemm_node.cpp
@@ -27,7 +25,6 @@ set(LAYERS
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_memory_node.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_permute_node.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_pooling_node.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_power_node.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_quantize_node.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_reorder_node.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_reshape_node.cpp
@@ -94,7 +91,10 @@ set(LAYERS
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/unique.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/unsqueeze.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/common/softmax.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/nodes/common/emitter.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/interp.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/nodes/jit_eltwise_emitters.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/nodes/jit_mkldnn_emitters.cpp
 
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/argmax.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/argmax_imp.cpp
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.cpp
index d871e71..06f4074 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.cpp
@@ -150,19 +150,6 @@ MKLDNNDescriptor::operator std::shared_ptr() {
     return typeDesc->getPtr();
 }
 
-MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr desc) {
-    this->desc.reset(new DescFwdImpl(desc));
-}
-
-MKLDNNDescriptor::operator std::shared_ptr() {
-    DescFwdImpl *typeDesc =
-            dynamic_cast *>(desc.get());
-    if (typeDesc == nullptr) {
-        THROW_IE_EXCEPTION << "Cannot cast descriptor!";
-    }
-    return typeDesc->getPtr();
-}
-
 MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr desc) {
     this->desc.reset(new DescFwdImpl(desc));
 }
 
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.h b/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.h
index bbdc50c..cd59e17 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.h
@@ -37,9 +37,6 @@ public:
     explicit MKLDNNDescriptor(std::shared_ptr desc);
     operator std::shared_ptr();
 
-    explicit MKLDNNDescriptor(std::shared_ptr desc);
-    operator std::shared_ptr();
-
     explicit MKLDNNDescriptor(std::shared_ptr desc);
     operator std::shared_ptr();
 
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_exec_network.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_exec_network.cpp
index 2d0ca6e..f387b69 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_exec_network.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_exec_network.cpp
@@ -30,6 +30,7 @@
 #include
 #include
 #include
+#include
 
 using namespace MKLDNNPlugin;
 using namespace InferenceEngine;
@@ -57,18 +58,17 @@ MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network
     if (_cfg.lpTransformsMode == Config::LPTransformsMode::On) {
 #ifdef USE_CNNNETWORK_LPT
         auto params = LayerTransformation::Params(true,  // updatePrecisions
-            true,  // quantizeOutputs
-            true,  // weightsToConst
-            LayerTransformation::QuantizedTensorAlignment::UpdateLevel,  // quantizedTensorAlignmentOnActivations
-            LayerTransformation::QuantizedTensorAlignment::None,  // quantizedTensorAlignmentOnWeights
-            true,  // roundQuantizedValues
-            true,  // updateBiases
-            true);  // supportAsymmetricQuantization
+                                                  true,  // quantizeOutputs
+                                                  true,  // weightsToConst
+                                                  LayerTransformation::QuantizedTensorAlignment::UpdateLevel,  // quantizedTensorAlignmentOnActivations
+                                                  LayerTransformation::QuantizedTensorAlignment::None,  // quantizedTensorAlignmentOnWeights
+                                                  true,  // roundQuantizedValues
+                                                  true,  // updateBiases
+                                                  true);  // supportAsymmetricQuantization
         LowPrecisionTransformer transformer(LowPrecisionTransformer::getAllTransformations(params).
             add(LayerTransformation::Params(params).setPrecisionsOnActivations({ Precision::U8 }), "Convolution").
-            addCleanup(
-                LayerTransformation::Params(params).setPrecisionsOnActivations({ Precision::U8 }),
-                "ScaleShift"));
+            remove("ScaleShift").
+ remove("Power")); transformer.transform(*_clonedNetwork); #endif @@ -102,6 +102,59 @@ MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network MKLDNNGraph::ApplyUnrollPasses(static_cast(*_clonedNetwork)); + auto createConstInputTo = [&](CNNLayerPtr layer, Blob::Ptr blob, std::string name) { + LayerParams attrs = {layer.get()->name + "_const_" + name, "Const", blob->getTensorDesc().getPrecision()}; + auto constLayer = std::make_shared(attrs); + constLayer->blobs["custom"] = blob; + + std::vector constDims(layer->insData[0].lock()->getDims().size(), 1); + if (constDims.size() > 1) + constDims[1] = blob.get()->size(); + else + constDims[0] = blob.get()->size(); + const TensorDesc& td = {blob->getTensorDesc().getPrecision(), constDims, TensorDesc::getLayoutByDims(constDims)}; + + DataPtr newEdgeAfterLayer(new Data(constLayer->name, td)); + newEdgeAfterLayer->setName(constLayer->name); + getCreatorLayer(newEdgeAfterLayer) = constLayer; + getInputTo(newEdgeAfterLayer).clear(); + + _clonedNetwork->addData(constLayer->name.c_str(), newEdgeAfterLayer); + IE_SUPPRESS_DEPRECATED_START + _clonedNetwork->addLayer(constLayer); + IE_SUPPRESS_DEPRECATED_END + + constLayer->outData.push_back(newEdgeAfterLayer); + getInputTo(newEdgeAfterLayer)[layer->name] = layer; + layer->insData.push_back(newEdgeAfterLayer); + }; + + auto all_layers = details::CNNNetSortTopologically(*_clonedNetwork); + for (auto &layer : all_layers) { + if (layer->type == "ScaleShift" && layer->insData.size() == 1) { + Blob::Ptr scalesBlob = layer->blobs["weights"]; + if (scalesBlob != nullptr) + createConstInputTo(layer, scalesBlob, "weights"); + + Blob::Ptr shiftBlob = layer->blobs["biases"]; + if (shiftBlob != nullptr) { + createConstInputTo(layer, shiftBlob, "biases"); + } else if (scalesBlob != nullptr) { + Blob::Ptr biases = make_shared_blob(scalesBlob->getTensorDesc()); + biases->allocate(); + auto biasesPtr = biases->buffer().as(); + for (size_t i = 0; i < biases->size(); i++) + biasesPtr[i] = 0; + + createConstInputTo(layer, biases, "biases"); + } + } else if (layer->type == "PReLU" && layer->insData.size() == 1) { + Blob::Ptr scalesBlob = layer->blobs["weights"]; + if (scalesBlob != nullptr) + createConstInputTo(layer, scalesBlob, "weights"); + } + } + if (_cfg.batchLimit > 1) { // check topology for applicability if (!CanProcessDynBatch(*_clonedNetwork)) { @@ -272,7 +325,6 @@ bool MKLDNNExecNetwork::CanProcessDynBatch(const InferenceEngine::ICNNNetwork &n type != SoftMax && type != Split && type != Concatenation && - type != Power && type != Eltwise && type != Crop && type != BatchNormalization && diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp index 41e17fb..ccdef34 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp @@ -6,10 +6,8 @@ #include "mkldnn_extension_utils.h" #include "nodes/mkldnn_reshape_node.h" -#include "nodes/mkldnn_activation_node.h" #include "nodes/mkldnn_pooling_node.h" #include "nodes/mkldnn_eltwise_node.h" -#include "nodes/mkldnn_depthwise_node.h" #include "nodes/mkldnn_concat_node.h" #include "nodes/mkldnn_reorder_node.h" #include "nodes/mkldnn_conv_node.h" @@ -18,6 +16,7 @@ #include "nodes/mkldnn_mvn_node.h" #include "nodes/mkldnn_resample_node.h" #include "nodes/mkldnn_interpolate_node.h" +#include "nodes/mkldnn_input_node.h" #include #include @@ -49,9 +48,6 @@ void 
MKLDNNGraphOptimizer::ApplyCommonGraphOptimizations(MKLDNNGraph &graph) { MergeTwoEqualScaleShifts(graph); graph.RemoveDroppedNodes(); - MergeSigmoidAndMultiplyToSwish(graph); - graph.RemoveDroppedNodes(); - MergeConversions(graph); graph.RemoveDroppedNodes(); @@ -70,20 +66,14 @@ void MKLDNNGraphOptimizer::ApplyCommonGraphOptimizations(MKLDNNGraph &graph) { FuseConvolutionAndZeroPoints(graph); graph.RemoveDroppedNodes(); -#if defined (COMPILED_CPU_MKLDNN_DEPTHWISE_NODE) FuseConvolutionAndDepthwise(graph); graph.RemoveDroppedNodes(); -#endif -#if defined(COMPILED_CPU_MKLDNN_ACTIVATION_NODE) FuseConvolutionAndActivation(graph); graph.RemoveDroppedNodes(); -#endif -#if defined (COMPILED_CPU_MKLDNN_DEPTHWISE_NODE) FuseConvolutionAndDepthwise(graph); graph.RemoveDroppedNodes(); -#endif FuseConvolutionAndQuantize(graph); graph.RemoveDroppedNodes(); @@ -91,10 +81,8 @@ void MKLDNNGraphOptimizer::ApplyCommonGraphOptimizations(MKLDNNGraph &graph) { graph.SortTopologically(); graph.RemoveDroppedEdges(); -#if defined (COMPILED_CPU_MKLDNN_DEPTHWISE_NODE) FuseConvolutionAndDepthwise(graph); graph.RemoveDroppedNodes(); -#endif FusePoolingAndQuantize(graph); graph.RemoveDroppedNodes(); @@ -206,16 +194,6 @@ void MKLDNNGraphOptimizer::MergeConversions(MKLDNNGraph& graph) { } void MKLDNNGraphOptimizer::FuseConvolutionAndZeroPoints(MKLDNNGraph &graph) { - auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) { - auto& edges = graph.GetEdges(); - for (auto it = edges.begin(); it != edges.end(); it++) { - if ((*it) == edge) { - edges.erase(it); - return; - } - } - }; - auto& graphNodes = graph.GetNodes(); auto isSutableConvNode = [](MKLDNNNodePtr node) { @@ -241,11 +219,8 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndZeroPoints(MKLDNNGraph &graph) { int OC = node->getChildEdgesAtPort(0)[0]->getDims()[1]; if (parent0->getType() == Eltwise) { - auto * eltwiseLayer = dynamic_cast(parent0->getCnnLayer().get()); - if (eltwiseLayer == nullptr) - THROW_IE_EXCEPTION << "Cannot get eltwise layer " << node->getName(); - - if (eltwiseLayer->_operation != EltwiseLayer::Sub) + auto* eltwiseNode = dynamic_cast(parent0.get()); + if (eltwiseNode->getOpType() != Subtract) return false; if (parent0->getParentEdges().size() != 2) @@ -296,11 +271,8 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndZeroPoints(MKLDNNGraph &graph) { int OC = node->getChildEdgesAtPort(0)[0]->getDims()[1]; if (parent0->getType() == Eltwise) { - auto * eltwiseLayer = dynamic_cast(parent0->getCnnLayer().get()); - if (eltwiseLayer == nullptr) - THROW_IE_EXCEPTION << "Cannot get eltwise layer " << node->getName(); - - if (eltwiseLayer->_operation != EltwiseLayer::Sub) + auto* eltwiseNode = dynamic_cast(parent0.get()); + if (eltwiseNode->getOpType() != Subtract) return false; if (parent0->getParentEdges().size() != 2) @@ -482,17 +454,17 @@ void MKLDNNGraphOptimizer::MergeTwoEqualScaleShifts(MKLDNNGraph& graph) { auto& graphNodes = graph.GetNodes(); auto isSutableScaleShiftNode = [](MKLDNNNodePtr node) { - if (node->getType() != Depthwise) + if (node->getType() != Eltwise) return false; - auto* depthwiseNode = dynamic_cast(node.get()); - if (depthwiseNode == nullptr) - THROW_IE_EXCEPTION << "Cannot cast " << node->getName() << " to Depthwise node"; + auto* eltwiseNode = dynamic_cast(node.get()); + if (eltwiseNode == nullptr) + THROW_IE_EXCEPTION << "Cannot cast " << node->getName() << " to Eltwise node"; - if (depthwiseNode->getChildEdges().size() != 1) + if (eltwiseNode->getChildEdges().size() != 1) return false; - if (depthwiseNode->getAlgorithm() 
!= depthwise_scale_shift || depthwiseNode->isBroadcast()) + if (eltwiseNode->getOpType() != MulAdd) return false; return true; @@ -502,16 +474,16 @@ void MKLDNNGraphOptimizer::MergeTwoEqualScaleShifts(MKLDNNGraph& graph) { if (node1->getParentEdgeAt(0) != node2->getParentEdgeAt(0)) return false; - auto *depthwiseNode1 = dynamic_cast(node1.get()); - auto *depthwiseNode2 = dynamic_cast(node2.get()); + auto *eltwiseNode1 = dynamic_cast(node1.get()); + auto *eltwiseNode2 = dynamic_cast(node2.get()); - auto depthwiseLayer1 = depthwiseNode1->getCnnLayer(); - auto depthwiseLayer2 = depthwiseNode2->getCnnLayer(); + auto eltwiseLayer1 = eltwiseNode1->getCnnLayer(); + auto eltwiseLayer2 = eltwiseNode2->getCnnLayer(); - Blob::Ptr scalesBlob1 = depthwiseLayer1->blobs["weights"]; - Blob::Ptr shiftsBlob1 = depthwiseLayer1->blobs["biases"]; - Blob::Ptr scalesBlob2 = depthwiseLayer2->blobs["weights"]; - Blob::Ptr shiftsBlob2 = depthwiseLayer2->blobs["biases"]; + Blob::Ptr scalesBlob1 = eltwiseLayer1->blobs["weights"]; + Blob::Ptr shiftsBlob1 = eltwiseLayer1->blobs["biases"]; + Blob::Ptr scalesBlob2 = eltwiseLayer2->blobs["weights"]; + Blob::Ptr shiftsBlob2 = eltwiseLayer2->blobs["biases"]; if (scalesBlob1 == nullptr || shiftsBlob1 == nullptr || scalesBlob2 == nullptr || shiftsBlob2 == nullptr) return false; @@ -533,6 +505,16 @@ void MKLDNNGraphOptimizer::MergeTwoEqualScaleShifts(MKLDNNGraph& graph) { auto MergeScaleShiftNodes = [&](MKLDNNNodePtr childNode1, MKLDNNNodePtr childNode2) { auto parentNode = childNode2->getParentEdgeAt(0)->getParent(); auto ccNode2 = childNode2->getChildEdgeAt(0)->getChild(); + + auto parentEdges = childNode2->parentEdges; + for (auto &parentEdge : parentEdges) { + auto p_edge = parentEdge.lock(); + if (p_edge->getParent() == parentNode) + continue; + + removeEdge(graph, p_edge); + } + graph.DropNode(childNode2); MKLDNNEdgePtr remEdge; @@ -572,103 +554,6 @@ void MKLDNNGraphOptimizer::MergeTwoEqualScaleShifts(MKLDNNGraph& graph) { } } -void MKLDNNGraphOptimizer::MergeSigmoidAndMultiplyToSwish(MKLDNNGraph& graph) { - auto& graphNodes = graph.GetNodes(); - std::vector newNodes; - - MKLDNNNodePtr parentNode; - MKLDNNNodePtr activationNode, eltwiseNode; - MKLDNNEdgePtr remEdge; - - auto areSutableChildNodes = [&]() { - auto childNode1 = parentNode->getChildEdgeAt(0)->getChild(); - auto childNode2 = parentNode->getChildEdgeAt(1)->getChild(); - - if (childNode1->getType() == Activation && childNode2->getType() == Eltwise) { - activationNode = childNode1; - eltwiseNode = childNode2; - remEdge = parentNode->getChildEdgeAt(1); - } else if (childNode1->getType() == Eltwise && childNode2->getType() == Activation) { - activationNode = childNode2; - eltwiseNode = childNode1; - remEdge = parentNode->getChildEdgeAt(0); - } else { - return false; - } - - if (activationNode->getParentEdges().size() != 1 || activationNode->getChildEdges().size() != 1) - return false; - - if (eltwiseNode->getParentEdges().size() != 2) - return false; - - if (activationNode->getChildEdgeAt(0)->getChild() != eltwiseNode) - return false; - - auto *activationNodePtr = dynamic_cast(activationNode.get()); - if (activationNodePtr == nullptr) - THROW_IE_EXCEPTION << "Cannot cast " << activationNode->getName() << " to Activation node"; - if (activationNodePtr->getAlgorithm() != eltwise_logistic) - return false; - - auto *eltwiseNodePtr = dynamic_cast(eltwiseNode.get()); - if (eltwiseNodePtr == nullptr) - THROW_IE_EXCEPTION << "Cannot cast " << eltwiseNode->getName() << " to Eltwise node"; - auto *eltwiseLayer = 
dynamic_cast(eltwiseNode->getCnnLayer().get()); - if (eltwiseLayer == nullptr) - THROW_IE_EXCEPTION << "Cannot get eltwise layer " << eltwiseNode->getName(); - if (eltwiseLayer->_operation != EltwiseLayer::Prod) - return false; - - return true; - }; - - auto MergeToSwish = [&]() { - // 1. Remove edge Parent-Eltwise - remEdge->drop(); - graph.GetEdges().erase(std::remove(graph.GetEdges().begin(), graph.GetEdges().end(), remEdge), graph.GetEdges().end()); - - // 2. Remove Sigmoid node and edges Parent-Sigmoid and Sigmoid-Eltwise - graph.DropNode(activationNode); - remEdge = parentNode->getChildEdgeAt(0); - auto oIndex = remEdge->getOutputNum(); - auto iIndex = remEdge->getInputNum(); - remEdge->drop(); - graph.GetEdges().erase(std::remove(graph.GetEdges().begin(), graph.GetEdges().end(), remEdge), graph.GetEdges().end()); - - // 3. Create Swish node - CNNLayerPtr swishLayer(new CNNLayer(*activationNode->getCnnLayer().get())); - swishLayer->name = activationNode->getName() + "_Swish"; - swishLayer->type = "Swish"; - MKLDNNNodePtr swishNode(new MKLDNNActivationNode(swishLayer, graph.getEngine(), graph.weightsCache)); - - // 4. Create edges Parent-Swish and Swish-Eltwise, connect to Swish node, add edges to graph - MKLDNNEdgePtr beforeSwishEdge(new MKLDNNEdge(parentNode, swishNode, iIndex, 0)); - MKLDNNEdgePtr afterSwishEdge(new MKLDNNEdge(swishNode, eltwiseNode, 0, oIndex)); - swishNode->addEdge(beforeSwishEdge); - swishNode->addEdge(afterSwishEdge); - graph.GetEdges().push_back(beforeSwishEdge); - graph.GetEdges().push_back(afterSwishEdge); - newNodes.push_back(swishNode); - - // 5. Remove Eltwise node - graph.DropNode(eltwiseNode); - }; - - for (int i = 0; i < graphNodes.size(); i++) { - parentNode = graphNodes[i]; - if (parentNode->getChildEdges().size() != 2) - continue; - - if (!areSutableChildNodes()) continue; - - MergeToSwish(); - } - for (int i = 0; i < newNodes.size(); i++) { - graph.GetNodes().push_back(newNodes[i]); - } -} - void MKLDNNGraphOptimizer::FuseBatchNormWithScale(MKLDNNGraph &graph) { auto &graphNodes = graph.GetNodes(); @@ -683,8 +568,18 @@ void MKLDNNGraphOptimizer::FuseBatchNormWithScale(MKLDNNGraph &graph) { return x->getName() == node_name;}) == outputNodes.end()) { if (bn->getChildEdges().size() == 1) { auto child = bn->getChildEdgeAt(0)->getChild(); - if (child->type == Depthwise && child->getCnnLayer()->type == "ScaleShift") { + if (child->type == Eltwise && child->getCnnLayer()->type == "ScaleShift") { bn->fuseWith(child); + + auto parentEdges = child->parentEdges; + for (auto &parentEdge : parentEdges) { + auto p_edge = parentEdge.lock(); + if (p_edge->getParent()->getType() == BatchNormalization) + continue; + + removeEdge(graph, p_edge); + } + graph.DropNode(child); } } @@ -693,30 +588,19 @@ void MKLDNNGraphOptimizer::FuseBatchNormWithScale(MKLDNNGraph &graph) { } } -#if defined(COMPILED_CPU_MKLDNN_ACTIVATION_NODE) void MKLDNNGraphOptimizer::FuseConvolutionAndActivation(MKLDNNGraph &graph) { - auto isOneOf = [&](mkldnn::algorithm alg, std::vector algs) { - for (auto a : algs) { - if (alg == a) { - return true; - } - } - return false; - }; - auto& graphNodes = graph.GetNodes(); auto isFusingSupported = [&](MKLDNNNodePtr conv, MKLDNNNodePtr activation) { if (!activation->getCnnLayer()) return false; - auto* activationNode = dynamic_cast(activation.get()); + auto* eltwiseNode = dynamic_cast(activation.get()); - return activationNode && - (activationNode->getAlgorithm() == eltwise_relu || + return eltwiseNode && + (eltwiseNode->getOpType() == Relu || 
(conv->getCnnLayer()->precision == Precision::FP32 && - isOneOf(activationNode->getAlgorithm(), {eltwise_elu, eltwise_logistic, eltwise_bounded_relu, eltwise_clamp, - eltwise_swish, eltwise_hswish, eltwise_mish, eltwise_hsigmoid}))); + IsOneOf(eltwiseNode->getOpType(), {Elu, Logistic, BoundedRelu, Clamp, Swish, Hswish, Mish, Hsigmoid}))); }; for (int i = 0; i < graphNodes.size(); i++) { @@ -766,25 +650,6 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndActivation(MKLDNNGraph &graph) { } void MKLDNNGraphOptimizer::FuseFullyConnectedAndSimpleOperation(MKLDNNGraph &graph) { - auto isOneOf = [&](mkldnn::algorithm alg, std::vector algs) { - for (auto a : algs) { - if (alg == a) { - return true; - } - } - return false; - }; - - auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) { - auto& edges = graph.GetEdges(); - for (auto it = edges.begin(); it != edges.end(); it++) { - if ((*it) == edge) { - edges.erase(it); - return; - } - } - }; - auto& graphNodes = graph.GetNodes(); auto isSutableParentNode = [](MKLDNNNodePtr node) { @@ -808,55 +673,52 @@ void MKLDNNGraphOptimizer::FuseFullyConnectedAndSimpleOperation(MKLDNNGraph &gra quantizeNode->isOutputLowBroadcast() && quantizeNode->isOutputHighBroadcast() && !quantizeNode->isBinarization()); } - } else if (childNode->getType() == Depthwise) { - auto* depthwiseNode = dynamic_cast(childNode.get()); - if (depthwiseNode == nullptr) - THROW_IE_EXCEPTION << "Cannot get depthwise layer " << childNode->getName(); + } else if (childNode->getType() == Eltwise) { + auto* eltwiseNode = dynamic_cast(childNode.get()); + if (eltwiseNode == nullptr) + THROW_IE_EXCEPTION << "Cannot get Eltwise node " << childNode->getName(); - if (parentNode->getParentEdgesAtPort(0)[0]->getDims().ndims() != 3) { - return ((depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_scale_shift && - depthwiseNode->isWithBiases()) || - (depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_prelu)); - } else { - const auto &depthwiseLayer = depthwiseNode->getCnnLayer(); - if (depthwiseLayer == nullptr) - THROW_IE_EXCEPTION << "Cannot get scale shift layer " << depthwiseNode->getName(); - - if (depthwiseNode->getAlgorithm() != mkldnn::algorithm::depthwise_scale_shift) - return false; - - Blob::Ptr scalesBlob = depthwiseLayer->blobs["weights"]; - if (scalesBlob == nullptr) + if (IsOneOf(eltwiseNode->getOpType(), {Relu, Gelu, Elu, Logistic, BoundedRelu, Clamp, Swish, Hswish, Mish, Hsigmoid})) { + return true; + } else if (IsOneOf(eltwiseNode->getOpType(), {MulAdd, Prelu})) { + if (eltwiseNode->getOpType() == MulAdd && eltwiseNode->getCnnLayer()->blobs.size() != 2) return false; - Blob::Ptr shiftsBlob = depthwiseLayer->blobs["biases"]; - if (shiftsBlob == nullptr) - return false; + if (parentNode->getParentEdgesAtPort(0)[0]->getDims().ndims() != 3) { + return true; + } else { + const auto &eltwiseLayer = eltwiseNode->getCnnLayer(); + if (eltwiseLayer == nullptr) + THROW_IE_EXCEPTION << "Cannot get scale shift layer " << eltwiseNode->getName(); - const float* scalesBufferPtr = scalesBlob->buffer().as(); - const float* shiftsBufferPtr = shiftsBlob->buffer().as(); + if (eltwiseNode->getOpType() != MulAdd) + return false; - if (scalesBlob->size() != shiftsBlob->size()) - return false; + Blob::Ptr scalesBlob = eltwiseLayer->blobs["weights"]; + if (scalesBlob == nullptr) + return false; - for (int i = 1; i < scalesBlob->size(); i++) - if (scalesBufferPtr[0] != scalesBufferPtr[i]) + Blob::Ptr shiftsBlob = eltwiseLayer->blobs["biases"]; + if (shiftsBlob == nullptr) return 
false; - for (int i = 1; i < shiftsBlob->size(); i++) - if (shiftsBufferPtr[0] != shiftsBufferPtr[i]) + const float *scalesBufferPtr = scalesBlob->buffer().as(); + const float *shiftsBufferPtr = shiftsBlob->buffer().as(); + + if (scalesBlob->size() != shiftsBlob->size()) return false; - return true; - } - } else if (childNode->getType() == Activation) { - auto* activationNode = dynamic_cast(childNode.get()); - if (activationNode == nullptr) - THROW_IE_EXCEPTION << "Cannot get activation layer " << childNode->getName(); + for (int i = 1; i < scalesBlob->size(); i++) + if (scalesBufferPtr[0] != scalesBufferPtr[i]) + return false; - return isOneOf(activationNode->getAlgorithm(), {eltwise_relu, eltwise_gelu, eltwise_elu, eltwise_logistic, - eltwise_bounded_relu, eltwise_clamp, eltwise_swish, eltwise_hswish, - eltwise_mish, eltwise_hsigmoid}); + for (int i = 1; i < shiftsBlob->size(); i++) + if (shiftsBufferPtr[0] != shiftsBufferPtr[i]) + return false; + + return true; + } + } } return false; @@ -878,7 +740,7 @@ void MKLDNNGraphOptimizer::FuseFullyConnectedAndSimpleOperation(MKLDNNGraph &gra parentNode->fuseWith(childNode); - if (childNode->getType() == Quantize) { + if (childNode->getType() == Quantize || childNode->getType() == Eltwise) { auto parentEdges = childNode->parentEdges; for (auto &parentEdge : parentEdges) { auto p_edge = parentEdge.lock(); @@ -892,9 +754,7 @@ void MKLDNNGraphOptimizer::FuseFullyConnectedAndSimpleOperation(MKLDNNGraph &gra graph.DropNode(childNode); } } -#endif -#if defined (COMPILED_CPU_MKLDNN_DEPTHWISE_NODE) void MKLDNNGraphOptimizer::FuseConvolutionAndDepthwise(MKLDNNGraph &graph) { auto& graphNodes = graph.GetNodes(); @@ -906,17 +766,17 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDepthwise(MKLDNNGraph &graph) { }; auto isSutableChildNode = [](MKLDNNNodePtr node) { - if (node->getType() != Depthwise) + if (node->getType() != Eltwise) return false; if (!node->getCnnLayer()) return false; - auto* depthwiseNode = dynamic_cast(node.get()); - if (depthwiseNode == nullptr) - THROW_IE_EXCEPTION << "Cannot get depthwise node " << node->getName(); - return ((depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_scale_shift && depthwiseNode->isWithBiases()) || - (depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_prelu)); + auto* eltwiseNode = dynamic_cast(node.get()); + if (eltwiseNode == nullptr) + THROW_IE_EXCEPTION << "Cannot get eltwise node " << node->getName(); + return ((eltwiseNode->getOpType() == MulAdd && node->getCnnLayer()->blobs.size() == 2) || + (eltwiseNode->getOpType() == Prelu)); }; for (int i = 0; i < graphNodes.size(); i++) { @@ -933,14 +793,32 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDepthwise(MKLDNNGraph &graph) { if (isSutableChildNode(depthwise1)) { conv->fuseWith(depthwise1); + + auto parents = depthwise1->parentEdges; + for (size_t j = 0; j < parents.size(); j++) { + auto p_edge = parents[j].lock(); + if (p_edge->getParent()->getType() == Eltwise) + continue; + + removeEdge(graph, p_edge); + } + graph.DropNode(depthwise1); } } + auto parents = depthwise0->parentEdges; + for (size_t j = 0; j < parents.size(); j++) { + auto p_edge = parents[j].lock(); + if (p_edge->getParent()->getType() == Convolution || p_edge->getParent()->getType() == BinaryConvolution) + continue; + + removeEdge(graph, p_edge); + } + graph.DropNode(depthwise0); } } -#endif void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) { auto& graphNodes = graph.GetNodes(); @@ -1088,16 +966,6 @@ void 
MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) { #if defined(COMPILED_CPU_MKLDNN_QUANTIZE_NODE) void MKLDNNGraphOptimizer::FuseConvolutionAndQuantize(MKLDNNGraph &graph) { - auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) { - auto& edges = graph.GetEdges(); - for (auto it = edges.begin(); it != edges.end(); it++) { - if ((*it) == edge) { - edges.erase(it); - return; - } - } - }; - auto& graphNodes = graph.GetNodes(); auto isSutableParentNode = [](MKLDNNNodePtr node) { @@ -1151,25 +1019,6 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndQuantize(MKLDNNGraph &graph) { } void MKLDNNGraphOptimizer::FuseConvolutionAndSimpleOperation(MKLDNNGraph &graph) { - auto isOneOf = [&](mkldnn::algorithm alg, std::vector algs) { - for (auto a : algs) { - if (alg == a) { - return true; - } - } - return false; - }; - - auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) { - auto& edges = graph.GetEdges(); - for (auto it = edges.begin(); it != edges.end(); it++) { - if ((*it) == edge) { - edges.erase(it); - return; - } - } - }; - auto& graphNodes = graph.GetNodes(); auto isSutableParentNode = [](MKLDNNNodePtr node) { @@ -1188,21 +1037,14 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndSimpleOperation(MKLDNNGraph &graph) THROW_IE_EXCEPTION << "Cannot get quantize layer " << node->getName(); return !quantizeNode->isBinarization(); - } else if (node->getType() == Depthwise) { - auto* depthwiseNode = dynamic_cast(node.get()); - if (depthwiseNode == nullptr) - THROW_IE_EXCEPTION << "Cannot get depthwise layer " << node->getName(); - - return ((depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_scale_shift && depthwiseNode->isWithBiases()) || - (depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_prelu)); - } else if (node->getType() == Activation) { - auto* activationNode = dynamic_cast(node.get()); - if (activationNode == nullptr) - THROW_IE_EXCEPTION << "Cannot get activation layer " << node->getName(); + } else if (node->getType() == Eltwise) { + auto* eltwiseNode = dynamic_cast(node.get()); + if (eltwiseNode == nullptr) + THROW_IE_EXCEPTION << "Cannot get eltwise node " << node->getName(); - return isOneOf(activationNode->getAlgorithm(), {eltwise_relu, eltwise_elu, eltwise_logistic, eltwise_bounded_relu, - eltwise_clamp, eltwise_swish, eltwise_hswish, eltwise_mish, - eltwise_hsigmoid}); + return ((eltwiseNode->getOpType() == MulAdd && node->getCnnLayer()->blobs.size() == 2) || + (eltwiseNode->getOpType() == Prelu) || + IsOneOf(eltwiseNode->getOpType(), {Relu, Elu, Logistic, BoundedRelu, Clamp, Swish, Hswish, Mish, Hsigmoid})); } return false; @@ -1224,7 +1066,7 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndSimpleOperation(MKLDNNGraph &graph) parentNode->fuseWith(childNode); - if (childNode->getType() == Quantize) { + if (childNode->getType() == Quantize || childNode->getType() == Eltwise) { auto parentEdges = childNode->parentEdges; for (auto &parentEdge : parentEdges) { auto p_edge = parentEdge.lock(); @@ -1240,16 +1082,6 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndSimpleOperation(MKLDNNGraph &graph) } void MKLDNNGraphOptimizer::FuseBinaryConvolutionAndQuantize(MKLDNNGraph &graph) { - auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) { - auto& edges = graph.GetEdges(); - for (auto it = edges.begin(); it != edges.end(); it++) { - if ((*it) == edge) { - edges.erase(it); - return; - } - } - }; - auto& graphNodes = graph.GetNodes(); auto isSutableParentNode = [](MKLDNNNodePtr node) { @@ -1294,16 +1126,6 @@ void 
MKLDNNGraphOptimizer::FuseBinaryConvolutionAndQuantize(MKLDNNGraph &graph) } void MKLDNNGraphOptimizer::FusePoolingAndQuantize(MKLDNNGraph &graph) { - auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) { - auto& edges = graph.GetEdges(); - for (auto it = edges.begin(); it != edges.end(); it++) { - if ((*it) == edge) { - edges.erase(it); - return; - } - } - }; - auto& graphNodes = graph.GetNodes(); auto isSutableParentNode = [](MKLDNNNodePtr node) { @@ -1427,30 +1249,16 @@ static bool is_data_dependency(const std::shared_ptr &parent, void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNGraph &graph) { std::vector &graphNodes = graph.GetNodes(); - auto isOneOf = [&](mkldnn::algorithm alg, std::vector algs) { - for (auto a : algs) { - if (alg == a) { - return true; - } - } - return false; - }; - auto isFusingSupported = [&](MKLDNNNodePtr conv, MKLDNNNodePtr activation) { if (!activation->getCnnLayer()) return false; -#if defined(COMPILED_CPU_MKLDNN_ACTIVATION_NODE) - auto* activationNode = dynamic_cast(activation.get()); + auto* eltwiseNode = dynamic_cast(activation.get()); - return activationNode && - (activationNode->getAlgorithm() == eltwise_relu || + return eltwiseNode && + (eltwiseNode->getOpType() == Relu || (conv->getCnnLayer()->precision == Precision::FP32 && - isOneOf(activationNode->getAlgorithm(), {eltwise_elu, eltwise_logistic, eltwise_bounded_relu, eltwise_clamp, - eltwise_swish, eltwise_hswish, eltwise_mish, eltwise_hsigmoid}))); -#else - return false; -#endif + IsOneOf(eltwiseNode->getOpType(), {Elu, Logistic, BoundedRelu, Clamp, Swish, Hswish, Mish, Hsigmoid}))); }; for (auto &graphNode : graphNodes) { @@ -1458,7 +1266,6 @@ void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNG continue; if (!std::dynamic_pointer_cast(graphNode)->isSum()) continue; - if (!std::dynamic_pointer_cast(graphNode)->isUnitScales()) continue; if (std::dynamic_pointer_cast(graphNode)->isWithBroadcast()) continue; // TODO: Enlarge to several inputs @@ -1582,16 +1389,6 @@ void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNG #endif void MKLDNNGraphOptimizer::FuseMVNAndSimpleOperation(MKLDNNGraph &graph) { - auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) { - auto& edges = graph.GetEdges(); - for (auto it = edges.begin(); it != edges.end(); it++) { - if ((*it) == edge) { - edges.erase(it); - return; - } - } - }; - auto& graphNodes = graph.GetNodes(); auto isSutableParentNode = [](MKLDNNNodePtr node) { @@ -1617,16 +1414,14 @@ void MKLDNNGraphOptimizer::FuseMVNAndSimpleOperation(MKLDNNGraph &graph) { if (quantizeNode == nullptr) THROW_IE_EXCEPTION << "Cannot get quantize layer " << node->getName(); return !quantizeNode->isBinarization(); - } else if (node->getType() == Depthwise) { - auto* depthwiseNode = dynamic_cast(node.get()); - if (depthwiseNode == nullptr) - THROW_IE_EXCEPTION << "Cannot get depthwise layer " << node->getName(); - return depthwiseNode->cnnLayer->type == "ScaleShift"; - } else if (node->getType() == Activation) { - auto* activationNode = dynamic_cast(node.get()); - if (activationNode == nullptr) - THROW_IE_EXCEPTION << "Cannot get activation layer " << node->getName(); - return activationNode->getAlgorithm() == eltwise_relu; + } else if (node->getType() == Eltwise) { + auto* eltwiseNode = dynamic_cast(node.get()); + if (eltwiseNode == nullptr) + THROW_IE_EXCEPTION << "Cannot get eltwise node " << node->getName(); + + return ((eltwiseNode->getOpType() == MulAdd) || + 
(eltwiseNode->getOpType() == Prelu) || + eltwiseNode->getOpType() == Relu); } return false; @@ -1648,7 +1443,7 @@ void MKLDNNGraphOptimizer::FuseMVNAndSimpleOperation(MKLDNNGraph &graph) { parentNode->fuseWith(childNode); - if (childNode->getType() == Quantize) { + if (childNode->getType() == Quantize || childNode->getType() == Eltwise) { auto parentEdges = childNode->parentEdges; for (auto &parentEdge : parentEdges) { auto p_edge = parentEdge.lock(); @@ -1664,16 +1459,6 @@ void MKLDNNGraphOptimizer::FuseMVNAndSimpleOperation(MKLDNNGraph &graph) { } void MKLDNNGraphOptimizer::FuseResampleAndSimpleOperation(MKLDNNGraph &graph) { - auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) { - auto& edges = graph.GetEdges(); - for (auto it = edges.begin(); it != edges.end(); it++) { - if ((*it) == edge) { - edges.erase(it); - return; - } - } - }; - auto& graphNodes = graph.GetNodes(); auto isSutableParentNode = [](MKLDNNNodePtr node) { @@ -1699,16 +1484,12 @@ void MKLDNNGraphOptimizer::FuseResampleAndSimpleOperation(MKLDNNGraph &graph) { if (quantizeNode == nullptr) THROW_IE_EXCEPTION << "Cannot get quantize layer " << node->getName(); return !quantizeNode->isBinarization(); - } else if (node->getType() == Depthwise) { - auto* depthwiseNode = dynamic_cast(node.get()); - if (depthwiseNode == nullptr) - THROW_IE_EXCEPTION << "Cannot get depthwise layer " << node->getName(); - return depthwiseNode->cnnLayer->type == "ScaleShift"; - } else if (node->getType() == Activation) { - auto* activationNode = dynamic_cast(node.get()); - if (activationNode == nullptr) - THROW_IE_EXCEPTION << "Cannot get activation layer " << node->getName(); - return activationNode->getAlgorithm() == eltwise_relu; + } else if (node->getType() == Eltwise) { + auto *eltwiseNode = dynamic_cast(node.get()); + if (eltwiseNode == nullptr) + THROW_IE_EXCEPTION << "Cannot get Eltwise node " << node->getName(); + return eltwiseNode->getOpType() == Relu || + eltwiseNode->getOpType() == MulAdd; } return false; @@ -1730,7 +1511,7 @@ void MKLDNNGraphOptimizer::FuseResampleAndSimpleOperation(MKLDNNGraph &graph) { parentNode->fuseWith(childNode); - if (childNode->getType() == Quantize) { + if (childNode->getType() == Quantize || childNode->getType() == Eltwise) { auto parentEdges = childNode->parentEdges; for (auto &parentEdge : parentEdges) { auto p_edge = parentEdge.lock(); @@ -1746,16 +1527,6 @@ void MKLDNNGraphOptimizer::FuseResampleAndSimpleOperation(MKLDNNGraph &graph) { } void MKLDNNGraphOptimizer::FuseInterpolateAndSimpleOperation(MKLDNNGraph &graph) { - auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) { - auto& edges = graph.GetEdges(); - for (auto it = edges.begin(); it != edges.end(); it++) { - if ((*it) == edge) { - edges.erase(it); - return; - } - } - }; - auto& graphNodes = graph.GetNodes(); auto isSuitableParentNode = [](MKLDNNNodePtr node) { @@ -1798,7 +1569,7 @@ void MKLDNNGraphOptimizer::FuseInterpolateAndSimpleOperation(MKLDNNGraph &graph) parentNode->fuseWith(childNode); - if (childNode->getType() == Quantize || childNode->getType() == Depthwise || childNode->getType() == Activation) { + if (childNode->getType() == Quantize || childNode->getType() == Eltwise) { auto parentEdges = childNode->parentEdges; for (auto &parentEdge : parentEdges) { auto p_edge = parentEdge.lock(); @@ -1814,25 +1585,6 @@ void MKLDNNGraphOptimizer::FuseInterpolateAndSimpleOperation(MKLDNNGraph &graph) } void MKLDNNGraphOptimizer::FuseNormalizeAndSimpleOperation(MKLDNNGraph &graph) { - auto isOneOf = [&](mkldnn::algorithm alg, 
std::vector algs) { - for (auto a : algs) { - if (alg == a) { - return true; - } - } - return false; - }; - - auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) { - auto& edges = graph.GetEdges(); - for (auto it = edges.begin(); it != edges.end(); it++) { - if ((*it) == edge) { - edges.erase(it); - return; - } - } - }; - auto& graphNodes = graph.GetNodes(); auto isSutableParentNode = [](MKLDNNNodePtr node) { @@ -1854,20 +1606,16 @@ void MKLDNNGraphOptimizer::FuseNormalizeAndSimpleOperation(MKLDNNGraph &graph) { if (quantizeNode == nullptr) THROW_IE_EXCEPTION << "Cannot get quantize layer " << node->getName(); return !quantizeNode->isBinarization(); - } else if (node->getType() == Depthwise) { - auto* depthwiseNode = dynamic_cast(node.get()); - if (depthwiseNode == nullptr) - THROW_IE_EXCEPTION << "Cannot get depthwise layer " << node->getName(); - return ((depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_scale_shift && depthwiseNode->isWithBiases()) || - (depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_prelu)); - } else if (node->getType() == Activation) { - auto* activationNode = dynamic_cast(node.get()); - if (activationNode == nullptr) - THROW_IE_EXCEPTION << "Cannot get activation layer " << node->getName(); - return isOneOf(activationNode->getAlgorithm(), {eltwise_relu, eltwise_gelu, eltwise_elu, eltwise_logistic, - eltwise_bounded_relu, eltwise_clamp, eltwise_tanh, eltwise_swish, eltwise_hswish, eltwise_mish, - eltwise_hsigmoid, eltwise_linear, eltwise_abs, eltwise_square, eltwise_sqrt}); + } else if (node->getType() == Eltwise) { + auto *eltwiseNode = dynamic_cast(node.get()); + if (eltwiseNode == nullptr) + THROW_IE_EXCEPTION << "Cannot get Eltwise node " << node->getName(); + return IsOneOf(eltwiseNode->getOpType(), {Relu, Gelu, Elu, Logistic, BoundedRelu, Clamp, Tanh, Swish, + Hswish, Mish, Hsigmoid, Linear, Abs, Square, Sqrt}) || + ((eltwiseNode->getOpType() == MulAdd && eltwiseNode->getCnnLayer()->blobs.size() == 2) || + (eltwiseNode->getOpType() == Prelu)); } + return false; }; @@ -1887,7 +1635,7 @@ void MKLDNNGraphOptimizer::FuseNormalizeAndSimpleOperation(MKLDNNGraph &graph) { parentNode->fuseWith(childNode); - if (childNode->getType() == Quantize) { + if (childNode->getType() == Quantize || childNode->getType() == Eltwise) { auto parentEdges = childNode->parentEdges; for (auto &parentEdge : parentEdges) { auto p_edge = parentEdge.lock(); @@ -1903,85 +1651,31 @@ void MKLDNNGraphOptimizer::FuseNormalizeAndSimpleOperation(MKLDNNGraph &graph) { } void MKLDNNGraphOptimizer::FuseEltwiseAndSimple(MKLDNNGraph &graph) { - auto isOneOf = [&](mkldnn::algorithm alg, std::vector algs) { - for (auto a : algs) { - if (alg == a) { - return true; - } - } - return false; - }; - - auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) { - auto& edges = graph.GetEdges(); - for (auto it = edges.begin(); it != edges.end(); it++) { - if ((*it) == edge) { - edges.erase(it); - return; - } - } - }; - auto& graphNodes = graph.GetNodes(); auto isSutableParentNode = [](MKLDNNNodePtr node) { - bool isSutableEltwise = node->getType() == Eltwise; + return node->getType() == Eltwise && node->getChildEdges().size() == 1; + }; - if (isSutableEltwise) { - auto *eltwiseLayer = dynamic_cast(node->getCnnLayer().get()); - if (eltwiseLayer == nullptr) - THROW_IE_EXCEPTION << "Cannot get Eltwise layer " << node->getName(); + auto isSutableChildNode = [&](MKLDNNNodePtr parentNode, MKLDNNNodePtr childNode) { + for (auto &childParentEdge : childNode->getParentEdges()) { + 
// WA to prevent unsupported reorder exception issue in some cases + if (childParentEdge.lock()->getParent()->getType() == Split) { + return false; + } - ptrdiff_t maxChannels = 1; - for (size_t i = 0; i < node->getParentEdges().size(); i++) { - if (node->getParentEdgeAt(0)->getDims().ndims() != node->getParentEdgeAt(i)->getDims().ndims()) - return false; - if (node->getParentEdgeAt(i)->getDims().ndims() != 2 && - node->getParentEdgeAt(i)->getDims().ndims() != 4 && - node->getParentEdgeAt(i)->getDims().ndims() != 5) + // Avoid cycle dependencies + for (auto &parentParentEdge : parentNode->getParentEdges()) { + if (childParentEdge.lock()->getParent() == parentParentEdge.lock()->getParent()) return false; - if (maxChannels < node->getParentEdgeAt(i)->getDims()[1]) - maxChannels = node->getParentEdgeAt(i)->getDims()[1]; } - - int simdWidth = mkldnn::impl::cpu::mayiuse(impl::cpu::cpu_isa_t::avx512_common) ? 16 : - mkldnn::impl::cpu::mayiuse(impl::cpu::cpu_isa_t::avx2) ? 8 : 4; - if (maxChannels < simdWidth) - return false; - - return node->getChildEdges().size() == 1 && - (eltwiseLayer->_operation == EltwiseLayer::Sum || eltwiseLayer->_operation == EltwiseLayer::Prod) && - !node->isFusedWith(Quantize); - } else { - return false; } - }; - auto isSutableChildNode = [&](MKLDNNNodePtr node) { - if (!node->getCnnLayer()) + if (!childNode->getFusedWith().empty()) return false; - if (node->getType() == Quantize) { - auto* quantizeNode = dynamic_cast(node.get()); - if (quantizeNode == nullptr) - THROW_IE_EXCEPTION << "Cannot get quantize layer " << node->getName(); - return !quantizeNode->isBinarization(); - } else if (node->getType() == Activation) { - // Applicability was narrowed down in order not to affect FP32 topologies - if (node->getChildEdges().size() != 1) - return false; - if (node->getChildEdgeAt(0)->getChild()->getType() != Quantize) - return false; - - auto *activationNode = dynamic_cast(node.get()); - if (activationNode == nullptr) - THROW_IE_EXCEPTION << "Cannot get activation layer " << node->getName(); - return isOneOf(activationNode->getAlgorithm(), {eltwise_relu, eltwise_elu, eltwise_logistic, eltwise_bounded_relu, - eltwise_clamp, eltwise_swish, eltwise_hswish, eltwise_mish, - eltwise_hsigmoid}); - } - - return false; + auto eltwiseNode = dynamic_cast(parentNode.get()); + return eltwiseNode->canFuse(childNode); }; auto parent = graphNodes.begin(); @@ -1993,7 +1687,7 @@ void MKLDNNGraphOptimizer::FuseEltwiseAndSimple(MKLDNNGraph &graph) { } auto childNode = parentNode->getChildEdgeAt(0)->getChild(); - if (!isSutableChildNode(childNode)) { + if (!isSutableChildNode(parentNode, childNode)) { parent++; continue; } @@ -2009,9 +1703,70 @@ void MKLDNNGraphOptimizer::FuseEltwiseAndSimple(MKLDNNGraph &graph) { removeEdge(graph, p_edge); } - } - graph.DropNode(childNode); + graph.DropNode(childNode); + } else if (childNode->getType() == Eltwise) { + auto childs = childNode->childEdges; + auto parents = childNode->parentEdges; + + for (size_t i = 0; i < parents.size(); i++) { + auto p_edge = parents[i].lock(); + if (!p_edge) continue; + auto parent = p_edge->getParent(); + if (!parent) continue; + + if (parent == parentNode) { + for (size_t j = 0; j < childs.size(); j++) { + if (!childs[j].lock()) + continue; + auto child = childs[j].lock()->getChild(); + if (!child) + continue; + + MKLDNNEdgePtr &remEdge = p_edge; + int inNum = 0; + if (remEdge) { + inNum = remEdge->getInputNum(); + remEdge->drop(); + removeEdge(graph, remEdge); + } + remEdge = childs[j].lock(); + int outNum = 0; + if 
(remEdge) { + outNum = remEdge->getOutputNum(); + remEdge->drop(); + removeEdge(graph, remEdge); + } + MKLDNNEdgePtr newEdge(new MKLDNNEdge(parent, child, inNum, outNum)); + auto &graphEdges = graph.GetEdges(); + graphEdges.push_back(newEdge); + parent->addEdge(newEdge); + + parent->outDims[inNum] = child->inDims[outNum]; + } + } else { + MKLDNNEdgePtr &remEdge = p_edge; + int inNum = 0; + if (remEdge) { + inNum = remEdge->getInputNum(); + remEdge->drop(); + removeEdge(graph, remEdge); + } + + auto parentEltwise = parentNode; + MKLDNNEdgePtr newEdge(new MKLDNNEdge(parent, parentEltwise, inNum, parentEltwise->getParentEdges().size())); + auto &graphEdges = graph.GetEdges(); + graphEdges.push_back(newEdge); + parent->addEdge(newEdge); + + parentEltwise->inDims.push_back(parent->outDims[0]); + } + } + + graph.DropNode(childNode); + } else { + graph.DropNode(childNode); + } } } @@ -2019,15 +1774,18 @@ void MKLDNNGraphOptimizer::RemoveIdentityOperator(MKLDNNGraph &graph) { for (MKLDNNNodePtr& node : graph.GetNodes()) { bool toDrop = false; - if (node->getType() == Power) { - PowerLayer* l = dynamic_cast(node->getCnnLayer().get()); - if (l == nullptr) - THROW_IE_EXCEPTION << "Cannot get power layer " << node->getName(); + if (node->getType() == Eltwise) { + auto* eltwiseNode = dynamic_cast(node.get()); + if (eltwiseNode->getOpType() == PowerStatic) { + PowerLayer *l = dynamic_cast(node->getCnnLayer().get()); + if (l == nullptr) + THROW_IE_EXCEPTION << "Cannot get power layer " << node->getName(); - if (l->power == 1.0f && l->scale == 1.0f && l->offset == 0.0f) toDrop = true; + if (l->power == 1.0f && l->scale == 1.0f && l->offset == 0.0f) toDrop = true; + } } - if (node->getType() == Depthwise && node->getCnnLayer()->type == "ScaleShift") { + if (node->getType() == Eltwise && node->getCnnLayer()->type == "ScaleShift") { ScaleShiftLayer* l = dynamic_cast(node->getCnnLayer().get()); if (l == nullptr) THROW_IE_EXCEPTION << "Cannot get scale shift layer " << node->getName(); @@ -2177,7 +1935,7 @@ void MKLDNNGraphOptimizer::DropConvertReorder(MKLDNNGraph& graph) { void MKLDNNGraphOptimizer::RemoveIOScaleShifts(MKLDNNGraph &graph) { for (MKLDNNNodePtr& node : graph.GetNodes()) { - if (node->getType() == Depthwise && node->getCnnLayer()->type == "ScaleShift") { + if (node->getType() == Eltwise && node->getCnnLayer()->type == "ScaleShift") { ScaleShiftLayer* l = dynamic_cast(node->getCnnLayer().get()); if (l == nullptr) THROW_IE_EXCEPTION << "Cannot get scale shift layer " << node->getName(); @@ -2235,6 +1993,25 @@ bool MKLDNNGraphOptimizer::IsOneOf(Type type, std::vector types) { return false; } +bool MKLDNNGraphOptimizer::IsOneOf(EltwiseOpType alg, std::vector algs) { + for (auto a : algs) { + if (alg == a) { + return true; + } + } + return false; +} + +void MKLDNNGraphOptimizer::removeEdge(MKLDNNGraph &graph, MKLDNNEdgePtr& edge) { + auto& edges = graph.GetEdges(); + for (auto it = edges.begin(); it != edges.end(); it++) { + if ((*it) == edge) { + edges.erase(it); + return; + } + } +} + void MKLDNNGraphOptimizer::FuseBroadcastAndEltwise(MKLDNNGraph &graph) { std::vector& graphNodes = graph.GetNodes(); @@ -2269,17 +2046,17 @@ void MKLDNNGraphOptimizer::FuseClampAndQuantize(MKLDNNGraph &graph) { auto& graphNodes = graph.GetNodes(); auto isSutableClampNode = [](MKLDNNNodePtr node) { - if (node->getType() != Activation) + if (node->getType() != Eltwise) return false; - auto* activationNode = dynamic_cast(node.get()); - if (activationNode == nullptr) - THROW_IE_EXCEPTION << "Cannot cast " << 
node->getName() << " to Activation node"; + auto* eltwiseNode = dynamic_cast(node.get()); + if (eltwiseNode == nullptr) + THROW_IE_EXCEPTION << "Cannot cast " << node->getName() << " to Eltwise node"; - if (activationNode->getChildEdges().size() != 1) + if (eltwiseNode->getChildEdges().size() != 1) return false; - if (activationNode->getAlgorithm() != eltwise_clamp) + if (eltwiseNode->getOpType() != Clamp) return false; return true; @@ -2297,9 +2074,9 @@ void MKLDNNGraphOptimizer::FuseClampAndQuantize(MKLDNNGraph &graph) { }; auto fuseClampAndQuantizeNodes = [](MKLDNNNodePtr parent, MKLDNNNodePtr child) { - auto* activationNode = dynamic_cast(parent.get()); - if (activationNode == nullptr) - THROW_IE_EXCEPTION << "Cannot cast " << parent->getName() << " to Activation node"; + auto* eltwiseNode = dynamic_cast(parent.get()); + if (eltwiseNode == nullptr) + THROW_IE_EXCEPTION << "Cannot cast " << parent->getName() << " to Eltwise node"; auto* quantizeNode = dynamic_cast(child.get()); if (quantizeNode == nullptr) @@ -2311,9 +2088,9 @@ void MKLDNNGraphOptimizer::FuseClampAndQuantize(MKLDNNGraph &graph) { std::vector newCropLow(cropLowData.size()); std::vector newCropHigh(cropHighData.size()); for (int i = 0; i < cropLowData.size(); i++) - newCropLow[i] = std::max(cropLowData[i], activationNode->getBeta()); + newCropLow[i] = std::max(cropLowData[i], eltwiseNode->getBeta()); for (int i = 0; i < cropHighData.size(); i++) - newCropHigh[i] = std::min(cropHighData[i], activationNode->getAlpha()); + newCropHigh[i] = std::min(cropHighData[i], eltwiseNode->getAlpha()); quantizeNode->setCropLow(newCropLow); quantizeNode->setCropHigh(newCropHigh); @@ -2338,17 +2115,17 @@ void MKLDNNGraphOptimizer::FuseScaleShiftAndQuantize(MKLDNNGraph &graph) { auto& graphNodes = graph.GetNodes(); auto isSutableScaleShiftNode = [](MKLDNNNodePtr node) { - if (node->getType() != Depthwise) + if (node->getType() != Eltwise) return false; - auto* depthwiseNode = dynamic_cast(node.get()); - if (depthwiseNode == nullptr) - THROW_IE_EXCEPTION << "Cannot cast " << node->getName() << " to Depthwise node"; + auto* eltwiseNode = dynamic_cast(node.get()); + if (eltwiseNode == nullptr) + THROW_IE_EXCEPTION << "Cannot cast " << node->getName() << " to eltwise node"; - if (depthwiseNode->getChildEdges().size() != 1) + if (eltwiseNode->getChildEdges().size() != 1) return false; - if (depthwiseNode->getAlgorithm() != depthwise_scale_shift || depthwiseNode->isBroadcast()) + if (eltwiseNode->getOpType() != MulAdd) return false; return true; @@ -2366,23 +2143,23 @@ void MKLDNNGraphOptimizer::FuseScaleShiftAndQuantize(MKLDNNGraph &graph) { }; auto fuseScaleShiftAndQuantizeNodes = [](MKLDNNNodePtr parent, MKLDNNNodePtr child) { - auto* depthwiseNode = dynamic_cast(parent.get()); - if (depthwiseNode == nullptr) - THROW_IE_EXCEPTION << "Cannot cast " << parent->getName() << " to Depthwise node"; + auto* eltwiseNode = dynamic_cast(parent.get()); + if (eltwiseNode == nullptr) + THROW_IE_EXCEPTION << "Cannot cast " << parent->getName() << " to eltwise node"; - auto depthwiseLayer = depthwiseNode->getCnnLayer(); - if (depthwiseLayer == nullptr) - THROW_IE_EXCEPTION << "Cannot get scale shift layer " << depthwiseNode->getName(); + auto eltwiseLayer = eltwiseNode->getCnnLayer(); + if (eltwiseLayer == nullptr) + THROW_IE_EXCEPTION << "Cannot get scale shift layer " << eltwiseNode->getName(); auto* quantizeNode = dynamic_cast(child.get()); if (quantizeNode == nullptr) THROW_IE_EXCEPTION << "Cannot cast " << child->getName() << " to Quantize node"; - 
Blob::Ptr scalesBlob = depthwiseLayer->blobs["weights"]; + Blob::Ptr scalesBlob = eltwiseLayer->blobs["weights"]; if (scalesBlob == nullptr) return false; - Blob::Ptr shiftsBlob = depthwiseLayer->blobs["biases"]; + Blob::Ptr shiftsBlob = eltwiseLayer->blobs["biases"]; if (shiftsBlob == nullptr) return false; @@ -2447,6 +2224,15 @@ void MKLDNNGraphOptimizer::FuseScaleShiftAndQuantize(MKLDNNGraph &graph) { if (!isSutableQuantizeNode(child)) continue; if (fuseScaleShiftAndQuantizeNodes(parent, child)) { + auto parentEdges = parent->parentEdges; + for (auto &parentEdge : parentEdges) { + auto p_edge = parentEdge.lock(); + if (p_edge->getParent()->getCnnLayer()->type != "Const") + continue; + + removeEdge(graph, p_edge); + } + graph.DropNode(parent); } } diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h index 2feb0f2..54bdda6 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h @@ -5,6 +5,7 @@ #pragma once #include "mkldnn_graph.h" +#include "nodes/mkldnn_eltwise_node.h" #include namespace MKLDNNPlugin { @@ -18,18 +19,12 @@ public: void ApplyImplSpecificGraphOptimizations(MKLDNNGraph& graph); private: - void SLTMTransform(MKLDNNGraph& graph); void MergeConversions(MKLDNNGraph& graph); void MergeGroupConvolution(MKLDNNGraph& graph); void MergeTwoEqualScaleShifts(MKLDNNGraph& graph); - void MergeSigmoidAndMultiplyToSwish(MKLDNNGraph& graph); -#if defined(COMPILED_CPU_MKLDNN_ACTIVATION_NODE) void FuseConvolutionAndActivation(MKLDNNGraph &graph); void FuseFullyConnectedAndSimpleOperation(MKLDNNGraph &graph); -#endif -#if defined (COMPILED_CPU_MKLDNN_DEPTHWISE_NODE) void FuseConvolutionAndDepthwise(MKLDNNGraph &graph); -#endif void FuseConvolutionAndSimpleOperation(MKLDNNGraph &graph); void FuseConvolutionAndDWConvolution(MKLDNNGraph &graph); #if defined(COMPILED_CPU_MKLDNN_QUANTIZE_NODE) @@ -59,6 +54,9 @@ private: void FuseClampAndQuantize(MKLDNNGraph &graph); bool IsOneOf(Type type, std::vector types); + bool IsOneOf(EltwiseOpType alg, std::vector algs); + + void removeEdge(MKLDNNGraph &graph, MKLDNNEdgePtr& edge); }; } // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp index f459688..ff96a75 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp @@ -22,12 +22,9 @@ #include #include #include -#include -#include #include #include #include -#include #include #include #include @@ -63,23 +60,23 @@ static const InferenceEngine::details::caseless_unordered_map { "Output", Output }, { "Reorder", Reorder }, { "Convolution", Convolution }, - { "ReLU", Activation }, - { "GELU", Activation }, - { "ELU", Activation }, - { "Sigmoid", Activation }, - { "Logistic", Activation }, - { "TanH", Activation }, - { "ReLU6", Activation }, - { "Exp", Activation }, - { "Not", Activation }, - { "Activation", Activation }, - { "Clamp", Activation }, - { "Swish", Activation }, - { "HSwish", Activation }, - { "Mish", Activation }, - { "HSigmoid", Activation }, - { "ScaleShift", Depthwise }, - { "PReLU", Depthwise }, + { "ReLU", Eltwise }, + { "GELU", Eltwise }, + { "ELU", Eltwise }, + { "Sigmoid", Eltwise }, + { "Logistic", Eltwise }, + { "TanH", Eltwise }, + { "ReLU6", Eltwise }, + { "Exp", Eltwise }, + { "Not", Eltwise }, + { "Activation", Eltwise }, + { "Clamp", Eltwise }, + { "Swish", Eltwise }, + { 
"HSwish", Eltwise }, + { "Mish", Eltwise }, + { "HSigmoid", Eltwise }, + { "ScaleShift", Eltwise }, + { "PReLU", Eltwise }, { "Norm", Lrn }, { "LRN", Lrn }, { "Pooling", Pooling }, @@ -91,9 +88,10 @@ static const InferenceEngine::details::caseless_unordered_map { "Split", Split }, { "Slice", Split }, { "Concat", Concatenation }, - { "Power", Power }, { "Deconvolution", Deconvolution }, { "Eltwise", Eltwise }, + { "Mod", Eltwise }, + { "Power", Eltwise }, { "Crop", Crop }, { "Reshape", Reshape }, { "Tile", Tile }, diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_node.h b/inference-engine/src/mkldnn_plugin/mkldnn_node.h index f5f6953..469cc7a 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_node.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_node.h @@ -44,7 +44,6 @@ enum Type { SoftMax, Split, Concatenation, - Power, Eltwise, Gemm, Crop, @@ -118,8 +117,6 @@ static std::string NameFromType(Type type) { return "Split"; case Concatenation: return "Concatenation"; - case Power: - return "Power"; case Depthwise: return "Depthwise"; case Crop: diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp index 97f97af..7df7a9f 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -145,6 +146,7 @@ static void Transformation(ICNNNetwork::Ptr& clonedNetwork, const Config& conf) pass_config->disable(); pass_config->disable(); pass_config->disable(); + pass_config->disable(); pass_config->enable(); diff --git a/inference-engine/src/mkldnn_plugin/nodes/common/emitter.cpp b/inference-engine/src/mkldnn_plugin/nodes/common/emitter.cpp new file mode 100644 index 0000000..8719a48 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/common/emitter.cpp @@ -0,0 +1,200 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "emitter.h" +#include + +using namespace mkldnn::impl::cpu; +using namespace mkldnn::impl; +using namespace Xbyak; + +namespace MKLDNNPlugin { + +template +constexpr bool one_of(T val, P item) { return val == item; } + +template +constexpr bool one_of(T val, P item, Args... item_others) { + return val == item || one_of(val, item_others...); +} + + +size_t jit_emitter::get_max_vecs_count() const { + return one_of(host_isa_, cpu::avx512_common, cpu::avx512_core) ? 32 : 16; +} + +size_t jit_emitter::get_vec_length() const { + return one_of(host_isa_, cpu::avx512_common, cpu::avx512_core) ? 64 : + one_of(host_isa_, cpu::avx2) ? 32 : 16; +} + +void jit_emitter::push_vec(const Xbyak::Address &addr, size_t vec_idx) const { + if (host_isa_ == cpu::sse42) { + h->uni_vmovups(addr, Xmm(vec_idx)); + } else if (host_isa_ == cpu::avx2) { + h->uni_vmovups(addr, Ymm(vec_idx)); + } else { + h->uni_vmovups(addr, Zmm(vec_idx)); + } +} + +void jit_emitter::pop_vec(size_t vec_idx, const Xbyak::Address &addr) const { + if (host_isa_ == cpu::sse42) { + h->uni_vmovups(Xmm(vec_idx), addr); + } else if (host_isa_ == cpu::avx2) { + h->uni_vmovups(Ymm(vec_idx), addr); + } else { + h->uni_vmovups(Zmm(vec_idx), addr); + } +} + +size_t jit_emitter::aux_vecs_count() const { + return 0; +} + +size_t jit_emitter::aux_gprs_count() const { + // We need one gpr to load table address + return entry_map_.empty() ? 
0 : 1;
+}
+
+std::set<InferenceEngine::Precision> jit_emitter::get_supported_precisions() {
+    return {InferenceEngine::Precision::FP32};
+}
+
+void jit_emitter::emitter_preamble(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &pool_vec_idxs,
+                                   const std::vector<size_t> &pool_gpr_idxs) {
+    using namespace Xbyak::util;
+
+    for (auto idx : pool_vec_idxs)
+        aux_vec_idxs.push_back(idx);
+
+    // For sse42 the mask register has to be Xmm(0)
+    if (host_isa_ == cpu::sse42 && aux_vecs_count() > 0) {
+        size_t idx = 0;
+        assert(std::find(in_vec_idxs.begin(), in_vec_idxs.end(), idx) == in_vec_idxs.end());
+        if (std::find(aux_vec_idxs.begin(), aux_vec_idxs.end(), idx) == aux_vec_idxs.end()) {
+            aux_vec_idxs.push_back(idx);
+            preserved_vec_idxs.push_back(idx);
+        }
+
+        // move the mask vector to the beginning of the aux vectors list to simplify further processing
+        for (size_t i = 0; i < aux_vec_idxs.size(); i++) {
+            if (aux_vec_idxs[i] == 0) {
+                size_t tmp = aux_vec_idxs[0];
+                aux_vec_idxs[0] = aux_vec_idxs[i];
+                aux_vec_idxs[i] = tmp;
+                break;
+            }
+        }
+    }
+
+    for (size_t idx = 0; idx < get_max_vecs_count(); idx++) {
+        if (aux_vec_idxs.size() >= aux_vecs_count()) break;
+
+        if (std::find(in_vec_idxs.begin(), in_vec_idxs.end(), idx) != in_vec_idxs.end()) continue;
+        if (std::find(aux_vec_idxs.begin(), aux_vec_idxs.end(), idx) != aux_vec_idxs.end()) continue;
+        if (std::find(preserved_vec_idxs.begin(), preserved_vec_idxs.end(), idx) != preserved_vec_idxs.end()) continue;
+
+        aux_vec_idxs.push_back(idx);
+        preserved_vec_idxs.push_back(idx);
+    }
+    assert(aux_vec_idxs.size() >= aux_vecs_count());
+
+    // Same logic, but for allocating gprs
+    for (auto idx : pool_gpr_idxs)
+        aux_gpr_idxs.push_back(idx);
+
+    for (size_t gpr_idx = 0; gpr_idx <= Operand::R15; ++gpr_idx) {
+        size_t _idx = Operand::R15 - gpr_idx;  // we allocate from the end
+
+        if (aux_gpr_idxs.size() >= aux_gprs_count()) break;
+        if (_idx == Operand::RSP) continue;
+        if (std::find(aux_gpr_idxs.begin(), aux_gpr_idxs.end(), _idx) != aux_gpr_idxs.end()) continue;
+        if (std::find(preserved_gpr_idxs.begin(), preserved_gpr_idxs.end(), _idx) != preserved_gpr_idxs.end()) continue;
+
+        aux_gpr_idxs.push_back(_idx);
+        preserved_gpr_idxs.push_back(_idx);
+    }
+    assert(aux_gpr_idxs.size() == aux_gprs_count());
+
+    if (!entry_map_.empty()) {
+        p_table = Reg64(aux_gpr_idxs[0]);
+        aux_gpr_idxs.erase(aux_gpr_idxs.begin());
+    }
+
+    for (size_t i = 0; i < preserved_gpr_idxs.size(); ++i)
+        h->push(Reg64(preserved_gpr_idxs[i]));
+
+    if (preserved_vec_idxs.size())
+        h->sub(h->rsp, preserved_vec_idxs.size() * get_vec_length());
+
+    for (size_t i = 0; i < preserved_vec_idxs.size(); ++i) {
+        push_vec(h->ptr[h->rsp + i * get_vec_length()], preserved_vec_idxs[i]);
+    }
+
+    if (!entry_map_.empty())
+        load_table_addr();
+}
+
+
+void jit_emitter::emitter_postamble() {
+    using namespace Xbyak::util;
+
+    for (size_t i = 0; i < preserved_vec_idxs.size(); ++i)
+        pop_vec(preserved_vec_idxs[i], h->ptr[h->rsp + i * get_vec_length()]);
+
+    if (preserved_vec_idxs.size())
+        h->add(h->rsp, preserved_vec_idxs.size() * get_vec_length());
+
+    // pop in the reverse order of the pushes in emitter_preamble
+    for (int i = static_cast<int>(preserved_gpr_idxs.size()) - 1; i >= 0; --i)
+        h->pop(Reg64(preserved_gpr_idxs[i]));
+
+    preserved_vec_idxs.clear();
+    preserved_gpr_idxs.clear();
+
+    aux_vec_idxs.clear();
+    aux_gpr_idxs.clear();
+}
+
+void jit_emitter::emit_table() {
+    h->align(64);
+    h->L(l_table);
+
+    // Assumption: entries can be inserted with dd, so they should be 4 bytes.
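+    // Worked example (illustrative): jit_equal_emitter registers two broadcast
+    // entries, "zero" (0x00000000) and "one" (0x3f800000). entry_map_ is a
+    // std::multimap keyed by string, so entries are laid out in key order:
+    // on avx2 (vec length 32) "one" gets off = 0 and "zero" gets off = 32,
+    // and table_val("zero") resolves to h->ptr[p_table + 32].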
+    assert(sizeof(table_entry_val_t) == 4);
+
+    // Run through the map and insert values stored there
+    for (auto it = entry_map_.begin(); it != entry_map_.end(); it++) {
+        const auto &te = (*it).second;  // get map entry for a given key
+        const auto len = te.bcast ? get_vec_length() : sizeof(table_entry_val_t);
+        for (size_t d = 0; d < len; d += sizeof(table_entry_val_t))
+            h->dd(te.val);
+    }
+}
+
+void jit_emitter::prepare_table() {
+    register_table_entries();
+
+    // Now that we registered the entries, we set the offsets. No
+    // entries should be registered after this point. This allows us to
+    // expect the same order when emitting the table entries in
+    // emit_table.
+    size_t off = 0;
+    for (auto it = entry_map_.begin(); it != entry_map_.end(); it++) {
+        auto &te = (*it).second;
+        te.off = off;
+        off += te.bcast ? get_vec_length() : sizeof(table_entry_val_t);
+    }
+}
+
+void jit_emitter::emit(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                       const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    emitter_preamble(in_vec_idxs, pool_vec_idxs, pool_gpr_idxs);
+
+    emit_impl(in_vec_idxs, out_vec_idxs, pool_vec_idxs, pool_gpr_idxs);
+
+    emitter_postamble();
+}
+
+}  // namespace MKLDNNPlugin
diff --git a/inference-engine/src/mkldnn_plugin/nodes/common/emitter.h b/inference-engine/src/mkldnn_plugin/nodes/common/emitter.h
new file mode 100644
index 0000000..53a1aef
--- /dev/null
+++ b/inference-engine/src/mkldnn_plugin/nodes/common/emitter.h
@@ -0,0 +1,128 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include
+#include "jit_generator.hpp"
+#include "mkldnn_node.h"
+#include
+
+namespace MKLDNNPlugin {
+
+class jit_emitter {
+public:
+    jit_emitter(mkldnn::impl::cpu::jit_generator* host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32)
+        : h(host), host_isa_(host_isa), n(node), exec_prc_(exec_prc) {
+        k_mask = Xbyak::Opmask(1);  // FIXME: in the general case we need to preserve the k_mask state as well
+    }
+
+    virtual void emit(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                      const std::vector<size_t> &pool_vec_idxs = {}, const std::vector<size_t> &pool_gpr_idxs = {});
+    virtual void emit_table();
+    virtual size_t get_inputs_num() = 0;
+    virtual size_t aux_vecs_count() const;
+    static std::set<InferenceEngine::Precision> get_supported_precisions();
+
+protected:
+    virtual size_t aux_gprs_count() const;
+
+    size_t get_max_vecs_count() const;
+    size_t get_vec_length() const;
+
+    const MKLDNNNode& n;
+    mkldnn::impl::cpu::jit_generator* h;
+    mkldnn::impl::cpu::cpu_isa_t host_isa_;
+    InferenceEngine::Precision exec_prc_;
+
+    Xbyak::Opmask k_mask;
+
+    virtual void prepare_table();
+    virtual void register_table_entries() {}
+
+    void load_table_addr() { h->mov(p_table, l_table); }
+
+    // we accept only 32bit hexadecimal table values to avoid any rounding
+    using table_entry_val_t = uint32_t;
+    using table_entry_offset_t = size_t;  // offsets are in bytes wrt p_table
+    using table_entry_bcast_t = bool;  // true => bcast value
+
+    struct table_entry_t {
+        table_entry_val_t val;
+        table_entry_bcast_t bcast;
+    };
+    struct mapped_table_entry_t {
+        table_entry_offset_t off;
+        table_entry_val_t val;
+        table_entry_bcast_t bcast;
+    };
+
+    Xbyak::Reg64 p_table;
+    Xbyak::Label l_table;
+
+    enum {
+        _cmp_eq_oq = mkldnn::impl::cpu::jit_generator::_cmp_eq_oq,
+        _cmp_neq_uq = mkldnn::impl::cpu::jit_generator::_cmp_neq_uq,
+        _cmp_lt_os = mkldnn::impl::cpu::jit_generator::_cmp_lt_os,
+        _cmp_le_os = mkldnn::impl::cpu::jit_generator::_cmp_le_os,
+        _cmp_ge_os = mkldnn::impl::cpu::jit_generator::_cmp_nlt_us,
+        _cmp_gt_os = mkldnn::impl::cpu::jit_generator::_cmp_nle_us,
+    };
+
+    virtual void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                           const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {}
+
+    virtual void emitter_preamble(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &pool_vec_idxs,
+                                  const std::vector<size_t> &pool_gpr_idxs);
+    virtual void emitter_postamble();
+
+    std::vector<size_t> aux_vec_idxs;
+    std::vector<size_t> aux_gpr_idxs;
+
+    static constexpr int k_mask_size = 8;
+
+    Xbyak::Address table_val(std::string key, size_t key_off_val_shift = 0) const {
+        auto off = table_off(key, key_off_val_shift);
+        return h->ptr[p_table + off];
+    }
+
+    using table_t = std::multimap<std::string, table_entry_t>;
+    using mapped_table_t = std::multimap<std::string, mapped_table_entry_t>;
+
+    mapped_table_t entry_map_;
+
+    void push_arg_entry_of(const std::string key, const table_entry_val_t val, const bool broadcast) {
+        mapped_table_entry_t te {0, val, broadcast};
+        entry_map_.insert(std::make_pair(key, te));
+    }
+
+    void push_entries_of(const table_t &t) {
+        for (auto it = t.begin(); it != t.end(); it++) {
+            auto key = (*it).first;
+            auto te = (*it).second;  // copy values from table
+            push_arg_entry_of(key, te.val, te.bcast);
+        }
+    }
+
+private:
+    std::vector<size_t> preserved_vec_idxs;
+    std::vector<size_t> preserved_gpr_idxs;
+
+    void push_vec(const Xbyak::Address &addr, size_t vec_idx) const;
+    void pop_vec(size_t vec_idx, const Xbyak::Address &addr) const;
+
+    size_t table_off(std::string& key, size_t key_off_val_shift = 0) const {
+        // assumption: all table entries sharing the same key also
+        // share their broadcast property
+        // TODO: enforce through data structure
+        const auto it = entry_map_.find(key);  // search an entry for a key
+        assert(it != entry_map_.end());
+        const auto &te = (*it).second;
+        const auto scale = te.bcast ?
get_vec_length() : sizeof(table_entry_val_t); + return te.off + key_off_val_shift * scale; + } +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/jit_eltwise_emitters.cpp b/inference-engine/src/mkldnn_plugin/nodes/jit_eltwise_emitters.cpp new file mode 100644 index 0000000..aa5449b --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/jit_eltwise_emitters.cpp @@ -0,0 +1,1417 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common/emitter.h" +#include "jit_eltwise_emitters.hpp" +#include "mkldnn_eltwise_node.h" +#include "jit_uni_eltwise.hpp" +#include "legacy/ie_layers.h" + +using namespace InferenceEngine; +using namespace mkldnn::impl::utils; +using namespace mkldnn::impl::cpu; +using namespace Xbyak; + +namespace MKLDNNPlugin { + +/// ADD /// +jit_add_emitter::jit_add_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) {} + +size_t jit_add_emitter::get_inputs_num() { return 2; } + +void jit_add_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) { + if (host_isa_ == cpu::sse42) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx512_common) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + assert(!"unsupported isa"); + } +} + +template +void jit_add_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + using Vmm = typename conditional3::type; + Vmm vmm_src0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); + + if (isa == cpu::sse42) { + h->uni_vmovups(vmm_dst, vmm_src0); + h->uni_vaddps(vmm_dst, vmm_dst, vmm_src1); + } else { + h->uni_vaddps(vmm_dst, vmm_src0, vmm_src1); + } +} + +/// MUL_ADD /// +jit_mul_add_emitter::jit_mul_add_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) {} + +size_t jit_mul_add_emitter::get_inputs_num() { return 3; } + +void jit_mul_add_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) { + if (host_isa_ == cpu::sse42) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx512_common) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + assert(!"unsupported isa"); + } +} + +template +void jit_mul_add_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + using Vmm = typename conditional3::type; + Vmm vmm_src0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_src2 = Vmm(in_vec_idxs[2]); + Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); + + if (isa == cpu::sse42) { + h->uni_vmovups(vmm_dst, vmm_src0); + h->mulps(vmm_dst, vmm_src1); + h->addps(vmm_dst, vmm_src2); + } else { + Vmm vmm_mul0; + if (vmm_dst.getIdx() == vmm_src0.getIdx()) { + h->uni_vmovups(vmm_aux0, vmm_src0); + vmm_mul0 = vmm_aux0; + } else { + vmm_mul0 = vmm_src0; + } + + Vmm vmm_mul1; + if (vmm_dst.getIdx() == vmm_src1.getIdx()) { + h->uni_vmovups(vmm_aux0, vmm_src1); + vmm_mul1 = vmm_aux0; + } else { + vmm_mul1 = vmm_src1; + } + + if (vmm_dst.getIdx() != 
vmm_src2.getIdx()) + h->uni_vmovups(vmm_dst, vmm_src2); + h->uni_vfmadd231ps(vmm_dst, vmm_mul0, vmm_mul1); + } +} + +size_t jit_mul_add_emitter::aux_vecs_count() const { + return 1; +} + +/// SUB /// +jit_subtract_emitter::jit_subtract_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) {} + +size_t jit_subtract_emitter::get_inputs_num() { return 2; } + +void jit_subtract_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) { + if (host_isa_ == cpu::sse42) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx512_common) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + assert(!"unsupported isa"); + } +} + +template +void jit_subtract_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + using Vmm = typename conditional3::type; + Vmm vmm_src0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); + + if (isa == cpu::sse42) { + h->uni_vmovups(vmm_dst, vmm_src0); + h->uni_vsubps(vmm_dst, vmm_dst, vmm_src1); + } else { + h->uni_vsubps(vmm_dst, vmm_src0, vmm_src1); + } +} + + +/// MULTIPLY /// +jit_multiply_emitter::jit_multiply_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) {} + +size_t jit_multiply_emitter::get_inputs_num() { return 2; } + +void jit_multiply_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) { + if (host_isa_ == cpu::sse42) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx512_common) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + assert(!"unsupported isa"); + } +} + +template +void jit_multiply_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + using Vmm = typename conditional3::type; + Vmm vmm_src0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); + + if (isa == cpu::sse42) { + h->uni_vmovups(vmm_dst, vmm_src0); + h->uni_vmulps(vmm_dst, vmm_dst, vmm_src1); + } else { + h->uni_vmulps(vmm_dst, vmm_src0, vmm_src1); + } +} + + +/// DIVIDE /// +jit_divide_emitter::jit_divide_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) {} + +size_t jit_divide_emitter::get_inputs_num() { return 2; } + +void jit_divide_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) { + if (host_isa_ == cpu::sse42) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx512_common) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + assert(!"unsupported isa"); + } +} + +template +void jit_divide_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + using Vmm = typename conditional3::type; + Vmm vmm_src0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); + + if (isa == 
cpu::sse42) { + h->uni_vmovups(vmm_dst, vmm_src0); + h->uni_vdivps(vmm_dst, vmm_dst, vmm_src1); + } else { + h->uni_vdivps(vmm_dst, vmm_src0, vmm_src1); + } +} + + +/// FLOOR_MOD /// +jit_floor_mod_emitter::jit_floor_mod_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) {} + +size_t jit_floor_mod_emitter::get_inputs_num() { return 2; } + +void jit_floor_mod_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) { + if (host_isa_ == cpu::sse42) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx512_common) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + assert(!"unsupported isa"); + } +} + +template +void jit_floor_mod_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + using Vmm = typename conditional3::type; + Vmm vmm_src0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); + Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); + + if (isa == cpu::sse42) { + if (vmm_dst.getIdx() != vmm_src0.getIdx()) + h->uni_vmovups(vmm_dst, vmm_src0); + h->uni_vmovups(vmm_aux0, vmm_src0); + h->uni_vdivps(vmm_aux0, vmm_aux0, vmm_src1); + h->uni_vroundps(vmm_aux0, vmm_aux0, 1); // rounding down + h->uni_vmulps(vmm_aux0, vmm_aux0, vmm_src1); + h->uni_vsubps(vmm_dst, vmm_dst, vmm_aux0); + } else { + if (vmm_dst.getIdx() != vmm_src0.getIdx()) + h->uni_vmovups(vmm_dst, vmm_src0); + h->uni_vdivps(vmm_aux0, vmm_src0, vmm_src1); + h->uni_vroundps(vmm_aux0, vmm_aux0, 1); // rounding down + h->uni_vmulps(vmm_aux0, vmm_aux0, vmm_src1); + h->uni_vsubps(vmm_dst, vmm_dst, vmm_aux0); + } +} + +size_t jit_floor_mod_emitter::aux_vecs_count() const { + return 1; +} + +/// MOD /// +jit_mod_emitter::jit_mod_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) {} + +size_t jit_mod_emitter::get_inputs_num() { return 2; } + +void jit_mod_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) { + if (host_isa_ == cpu::sse42) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx512_common) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + assert(!"unsupported isa"); + } +} + +template +void jit_mod_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + using Vmm = typename conditional3::type; + Vmm vmm_src0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); + Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); + + if (isa == cpu::sse42) { + if (vmm_dst.getIdx() != vmm_src0.getIdx()) + h->uni_vmovups(vmm_dst, vmm_src0); + h->uni_vmovups(vmm_aux0, vmm_src0); + h->uni_vdivps(vmm_aux0, vmm_aux0, vmm_src1); + h->uni_vroundps(vmm_aux0, vmm_aux0, 3); // truncate + h->uni_vmulps(vmm_aux0, vmm_aux0, vmm_src1); + h->uni_vsubps(vmm_dst, vmm_dst, vmm_aux0); + } else { + if (vmm_dst.getIdx() != vmm_src0.getIdx()) + h->uni_vmovups(vmm_dst, vmm_src0); + h->uni_vdivps(vmm_aux0, vmm_src0, vmm_src1); + h->uni_vroundps(vmm_aux0, vmm_aux0, 3); // truncate + h->uni_vmulps(vmm_aux0, vmm_aux0, vmm_src1); + h->uni_vsubps(vmm_dst, 
vmm_dst, vmm_aux0); + } +} + +size_t jit_mod_emitter::aux_vecs_count() const { + return 1; +} + +/// MAXIMUM /// +jit_maximum_emitter::jit_maximum_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) {} + +size_t jit_maximum_emitter::get_inputs_num() { return 2; } + +void jit_maximum_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) { + if (host_isa_ == cpu::sse42) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx512_common) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + assert(!"unsupported isa"); + } +} + +template +void jit_maximum_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + using Vmm = typename conditional3::type; + Vmm vmm_src0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); + + auto uni_vmax = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) { + switch (exec_prc_) { + case Precision::FP32: h->uni_vmaxps(vmm_dst, vmm_src0, vmm_src1); break; + case Precision::I32: h->uni_vpmaxsd(vmm_dst, vmm_src0, vmm_src1); break; + default: assert(!"unsupported precision"); + } + }; + + if (isa == cpu::sse42) { + if (vmm_src0.getIdx() != vmm_dst.getIdx()) + h->uni_vmovups(vmm_dst, vmm_src0); + uni_vmax(vmm_dst, vmm_dst, vmm_src1); + } else { + uni_vmax(vmm_dst, vmm_src0, vmm_src1); + } +} + +std::set jit_maximum_emitter::get_supported_precisions() { + return {Precision::FP32, Precision::I32}; +} + +/// MINIMUM /// +jit_minimum_emitter::jit_minimum_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) {} + +size_t jit_minimum_emitter::get_inputs_num() { return 2; } + +void jit_minimum_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) { + if (host_isa_ == cpu::sse42) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx512_common) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + assert(!"unsupported isa"); + } +} + +template +void jit_minimum_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + using Vmm = typename conditional3::type; + Vmm vmm_src0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); + + auto uni_vmin = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) { + switch (exec_prc_) { + case Precision::FP32: h->uni_vminps(vmm_dst, vmm_src0, vmm_src1); break; + case Precision::I32: h->uni_vpminsd(vmm_dst, vmm_src0, vmm_src1); break; + default: assert(!"unsupported precision"); + } + }; + + if (isa == cpu::sse42) { + if (vmm_src0.getIdx() != vmm_dst.getIdx()) + h->uni_vmovups(vmm_dst, vmm_src0); + uni_vmin(vmm_dst, vmm_dst, vmm_src1); + } else { + uni_vmin(vmm_dst, vmm_src0, vmm_src1); + } +} + +std::set jit_minimum_emitter::get_supported_precisions() { + return {Precision::FP32, Precision::I32}; +} + +/// SQUARED_DIFFERENCE /// +jit_squared_difference_emitter::jit_squared_difference_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) {} + 
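+// Scalar reference for the SIMD code below (an illustrative sketch, not generated
+// by the emitter): per lane, squared difference is simply
+//   float squared_difference_ref(float a, float b) { float d = a - b; return d * d; }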
+size_t jit_squared_difference_emitter::get_inputs_num() { return 2; } + +void jit_squared_difference_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) { + if (host_isa_ == cpu::sse42) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx512_common) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + assert(!"unsupported isa"); + } +} + +template +void jit_squared_difference_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + using Vmm = typename conditional3::type; + Vmm vmm_src0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); + + if (isa == cpu::sse42) { + if (vmm_src0.getIdx() != vmm_dst.getIdx()) + h->uni_vmovups(vmm_dst, vmm_src0); + h->uni_vsubps(vmm_dst, vmm_dst, vmm_src1); + h->uni_vmulps(vmm_dst, vmm_dst, vmm_dst); + } else { + h->uni_vsubps(vmm_dst, vmm_src0, vmm_src1); + h->uni_vmulps(vmm_dst, vmm_dst, vmm_dst); + } +} + + +/// POWER_DYNAMIC /// +jit_power_dynamic_emitter::jit_power_dynamic_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) {} + +size_t jit_power_dynamic_emitter::get_inputs_num() { return 2; } + +void jit_power_dynamic_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) { + if (host_isa_ == cpu::sse42) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx512_common) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + assert(!"unsupported isa"); + } +} + +template +void jit_power_dynamic_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + using Vmm = typename conditional3::type; + Vmm vmm_src0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); + + Xmm xmm0 = Xmm(0), xmm1 = Xmm(1); + + // caller obligation to save gprs as callee may use them + size_t gpr_size = 8; + Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->rax, + h->rcx, h->rdx, h->rdi, h->rsi, h->rbp, h->rbx}; + size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]); + + h->sub(h->rsp, n_gprs_to_save * gpr_size); + for (size_t i = 0; i < n_gprs_to_save; ++i) + h->mov(h->ptr[h->rsp + i * gpr_size], gprs_to_save[i]); + + // caller obligation to save k-regs as callee may use them + size_t n_k_regs_to_save = 8; + if (isa == cpu::avx512_common || isa == cpu::avx512_core) { + h->sub(h->rsp, n_k_regs_to_save * k_mask_size); + for (size_t i = 0; i < n_k_regs_to_save; ++i) { + if (mayiuse(avx512_core)) + h->kmovq(h->ptr[h->rsp + i * k_mask_size], Opmask(i)); + else + h->kmovw(h->ptr[h->rsp + i * k_mask_size], Opmask(i)); + } + } + + // 1. Caller obligation to save vector registers as callee may use them. + // 2. Additionally save space for vmm_src, to put the answer in-place on + // this space and space for beta. + // 3. There is an implicit assumption that the host code uses the same + // `isa` as the injector. Once the assumption is wrong, `vecs_count` and + // `vlen` should be replaced with `host_isa::vlen` and + // `host_isa::vecs_count`. 
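+    // Resulting stack layout (illustrative), lowest address first:
+    //   [rsp + 0 * vlen, rsp + 1 * vlen) : vmm_src0 (src, overwritten in-place with the result)
+    //   [rsp + 1 * vlen, rsp + 2 * vlen) : vmm_src1 (beta, the per-lane exponents)
+    //   [rsp + 2 * vlen, ...)            : saved Vmm(0) .. Vmm(vecs_count - 1)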
+    h->sub(h->rsp, (get_max_vecs_count() + 2) * get_vec_length());
+    for (size_t i = 2; i < get_max_vecs_count() + 2; ++i)
+        h->uni_vmovups(h->ptr[h->rsp + i * get_vec_length()], Vmm(i - 2));
+    h->uni_vmovups(h->ptr[h->rsp + 0 * get_vec_length()], vmm_src0);  // src
+    h->uni_vmovups(h->ptr[h->rsp + 1 * get_vec_length()], vmm_src1);  // beta
+
+    // save the function address in a gpr to pass to the call instruction
+    h->mov(h->rbp, reinterpret_cast<uintptr_t>(powf));
+
+    // align stack on 16-byte as ABI requires
+    h->mov(h->rbx, h->rsp);
+    h->and_(h->rbx, 0xf);
+    h->sub(h->rsp, h->rbx);
+
+    // Take src, apply powf on it and replace the value on the stack with dst.
+    for (size_t i = 0; i < get_vec_length() / sizeof(float); ++i) {
+        const Address &source = h->ptr[h->rsp + h->rbx + i * sizeof(float)];
+        h->uni_vmovss(xmm0, source);
+        h->uni_vmovss(xmm1, h->ptr[h->rsp + h->rbx + get_vec_length() + i * sizeof(float)]);
+        h->call(h->rbp);
+        h->uni_vmovss(source, xmm0);
+    }
+
+    h->add(h->rsp, h->rbx);
+
+    // restore vector registers
+    for (size_t i = get_max_vecs_count() + 1; i >= 2; --i)
+        h->uni_vmovups(Vmm(i - 2), h->ptr[h->rsp + i * get_vec_length()]);
+    h->uni_vmovups(vmm_dst, h->ptr[h->rsp + 0 * get_vec_length()]);
+    h->add(h->rsp, (get_max_vecs_count() + 2) * get_vec_length());
+
+    // restore k registers
+    if (isa == cpu::avx512_common || isa == cpu::avx512_core) {
+        for (int i = n_k_regs_to_save - 1; i >= 0; --i) {
+            if (mayiuse(avx512_core))
+                h->kmovq(Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
+            else
+                h->kmovw(Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
+        }
+        h->add(h->rsp, n_k_regs_to_save * k_mask_size);
+    }
+
+    // restore gpr registers
+    for (int i = n_gprs_to_save - 1; i >= 0; --i)
+        h->mov(gprs_to_save[i], h->ptr[h->rsp + i * gpr_size]);
+    h->add(h->rsp, n_gprs_to_save * gpr_size);
+}
+
+
+/// EQUAL ///
+jit_equal_emitter::jit_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+: jit_emitter(host, host_isa, node, exec_prc) {
+    prepare_table();
+}
+
+size_t jit_equal_emitter::get_inputs_num() { return 2; }
+
+void jit_equal_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                  const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    if (host_isa_ == cpu::sse42) {
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx2) {
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx512_common) {
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+    } else {
+        assert(!"unsupported isa");
+    }
+}
+
+template <cpu_isa_t isa>
+void jit_equal_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
+    Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
+    Vmm vmm_dst = Vmm(out_vec_idxs[0]);
+    Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
+    Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
+
+    if (isa == cpu::sse42) {
+        h->movups(vmm_aux0, vmm_src0);
+        h->cmpps(vmm_aux0, vmm_src1, _cmp_eq_oq);
+        h->movups(vmm_aux1, table_val("one"));
+        h->pxor(vmm_dst, vmm_dst);
+        h->blendvps(vmm_dst, vmm_aux1);
+    } else if (isa == cpu::avx2) {
+        h->vcmpeqps(vmm_aux0, vmm_src0, vmm_src1);
+        h->uni_vmovups(vmm_dst, table_val("zero"));
+        h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0);
+    } else {
+        h->vcmpps(k_mask, vmm_src0, vmm_src1, _cmp_eq_oq);
+        h->uni_vmovups(vmm_dst, table_val("zero"));
+        h->vblendmps(vmm_dst | k_mask, vmm_dst, table_val("one"));
+    }
+}
+
+void jit_equal_emitter::register_table_entries() {
+    push_arg_entry_of("zero", 0x00000000,
true); + push_arg_entry_of("one", 0x3f800000, true); +} + +size_t jit_equal_emitter::aux_vecs_count() const { + return 2; +} + +/// NOT_EQUAL /// +jit_not_equal_emitter::jit_not_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) { + prepare_table(); +} + +size_t jit_not_equal_emitter::get_inputs_num() { return 2; } + +void jit_not_equal_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) { + if (host_isa_ == cpu::sse42) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx512_common) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + assert(!"unsupported isa"); + } +} + +template +void jit_not_equal_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + using Vmm = typename conditional3::type; + Vmm vmm_src0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); + Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); + Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); + + if (isa == cpu::sse42) { + h->movups(vmm_aux0, vmm_src0); + h->cmpps(vmm_aux0, vmm_src1, _cmp_eq_oq); + h->movups(vmm_dst, table_val("one")); + h->pxor(vmm_aux1, vmm_aux1); + h->blendvps(vmm_dst, vmm_aux1); + } else if (isa == cpu::avx2) { + h->vcmpeqps(vmm_aux0, vmm_src0, vmm_src1); + h->uni_vmovups(vmm_dst, table_val("one")); + h->vblendvps(vmm_dst, vmm_dst, table_val("zero"), vmm_aux0); + } else { + h->vcmpps(k_mask, vmm_src0, vmm_src1, _cmp_eq_oq); + h->uni_vmovups(vmm_dst, table_val("one")); + h->vblendmps(vmm_dst | k_mask, vmm_dst, table_val("zero")); + } +} + +void jit_not_equal_emitter::register_table_entries() { + push_arg_entry_of("zero", 0x00000000, true); + push_arg_entry_of("one", 0x3f800000, true); +} + +size_t jit_not_equal_emitter::aux_vecs_count() const { + return 2; +} + +/// GREATER /// +jit_greater_emitter::jit_greater_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) { + prepare_table(); +} + +size_t jit_greater_emitter::get_inputs_num() { return 2; } + +void jit_greater_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) { + if (host_isa_ == cpu::sse42) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx512_common) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + assert(!"unsupported isa"); + } +} + +template +void jit_greater_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + using Vmm = typename conditional3::type; + Vmm vmm_src0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); + Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); + Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); + + if (isa == cpu::sse42) { + h->movups(vmm_aux0, vmm_src0); + h->cmpps(vmm_aux0, vmm_src1, _cmp_gt_os); + h->movups(vmm_aux1, table_val("one")); + h->pxor(vmm_dst, vmm_dst); + h->blendvps(vmm_dst, vmm_aux1); + } else if (isa == cpu::avx2) { + h->vcmpgtps(vmm_aux0, vmm_src0, vmm_src1); + h->uni_vmovups(vmm_dst, table_val("zero")); + h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0); + } else { + 
h->vcmpps(k_mask, vmm_src0, vmm_src1, _cmp_gt_os); + h->uni_vmovups(vmm_dst, table_val("zero")); + h->vblendmps(vmm_dst | k_mask, vmm_dst, table_val("one")); + } +} + +void jit_greater_emitter::register_table_entries() { + push_arg_entry_of("zero", 0x00000000, true); + push_arg_entry_of("one", 0x3f800000, true); +} + +size_t jit_greater_emitter::aux_vecs_count() const { + return 2; +} + +/// GREATER_EQUAL /// +jit_greater_equal_emitter::jit_greater_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) { + prepare_table(); +} + +size_t jit_greater_equal_emitter::get_inputs_num() { return 2; } + +void jit_greater_equal_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) { + if (host_isa_ == cpu::sse42) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx512_common) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + assert(!"unsupported isa"); + } +} + +template +void jit_greater_equal_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + using Vmm = typename conditional3::type; + Vmm vmm_src0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); + Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); + Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); + + if (isa == cpu::sse42) { + h->movups(vmm_aux0, vmm_src0); + h->cmpps(vmm_aux0, vmm_src1, _cmp_ge_os); + h->movups(vmm_aux1, table_val("one")); + h->pxor(vmm_dst, vmm_dst); + h->blendvps(vmm_dst, vmm_aux1); + } else if (isa == cpu::avx2) { + h->vcmpgeps(vmm_aux0, vmm_src0, vmm_src1); + h->uni_vmovups(vmm_dst, table_val("zero")); + h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0); + } else { + h->vcmpps(k_mask, vmm_src0, vmm_src1, _cmp_ge_os); + h->uni_vmovups(vmm_dst, table_val("zero")); + h->vblendmps(vmm_dst | k_mask, vmm_dst, table_val("one")); + } +} + +void jit_greater_equal_emitter::register_table_entries() { + push_arg_entry_of("zero", 0x00000000, true); + push_arg_entry_of("one", 0x3f800000, true); +} + +size_t jit_greater_equal_emitter::aux_vecs_count() const { + return 2; +} + +/// LESS /// +jit_less_emitter::jit_less_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) { + prepare_table(); +} + +size_t jit_less_emitter::get_inputs_num() { return 2; } + +void jit_less_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) { + if (host_isa_ == cpu::sse42) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx512_common) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + assert(!"unsupported isa"); + } +} + +template +void jit_less_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + using Vmm = typename conditional3::type; + Vmm vmm_src0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); + Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); + Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); + + if (isa == cpu::sse42) { + h->movups(vmm_aux0, vmm_src0); + h->cmpps(vmm_aux0, vmm_src1, _cmp_lt_os); + h->movups(vmm_aux1, table_val("one")); 
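+        // Note: SSE4.1 blendvps takes its selection mask implicitly from XMM0;
+        // emitter_preamble() pins aux_vec_idxs[0] to Xmm(0) on sse42 whenever
+        // aux_vecs_count() > 0, so the cmpps result in vmm_aux0 lands in the
+        // right register for the blend below.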
+ h->pxor(vmm_dst, vmm_dst); + h->blendvps(vmm_dst, vmm_aux1); + } else if (isa == cpu::avx2) { + h->vcmpltps(vmm_aux0, vmm_src0, vmm_src1); + h->uni_vmovups(vmm_dst, table_val("zero")); + h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0); + } else { + h->vcmpps(k_mask, vmm_src0, vmm_src1, _cmp_lt_os); + h->uni_vmovups(vmm_dst, table_val("zero")); + h->vblendmps(vmm_dst | k_mask, vmm_dst, table_val("one")); + } +} + +void jit_less_emitter::register_table_entries() { + push_arg_entry_of("zero", 0x00000000, true); + push_arg_entry_of("one", 0x3f800000, true); +} + +size_t jit_less_emitter::aux_vecs_count() const { + return 2; +} + +/// LESS_EQUAL /// +jit_less_equal_emitter::jit_less_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) { + prepare_table(); +} + +size_t jit_less_equal_emitter::get_inputs_num() { return 2; } + +void jit_less_equal_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) { + if (host_isa_ == cpu::sse42) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx512_common) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + assert(!"unsupported isa"); + } +} + +template +void jit_less_equal_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + using Vmm = typename conditional3::type; + Vmm vmm_src0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); + + Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); + Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); + + if (isa == cpu::sse42) { + h->movups(vmm_aux0, vmm_src0); + h->cmpps(vmm_aux0, vmm_src1, _cmp_le_os); + h->movups(vmm_aux1, table_val("one")); + h->pxor(vmm_dst, vmm_dst); + h->blendvps(vmm_dst, vmm_aux1); + } else if (isa == cpu::avx2) { + h->vcmpleps(vmm_aux0, vmm_src0, vmm_src1); + h->uni_vmovups(vmm_dst, table_val("zero")); + h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0); + } else { + h->vcmpps(k_mask, vmm_src0, vmm_src1, _cmp_le_os); + h->uni_vmovups(vmm_dst, table_val("zero")); + h->vblendmps(vmm_dst | k_mask, vmm_dst, table_val("one")); + } +} + +void jit_less_equal_emitter::register_table_entries() { + push_arg_entry_of("zero", 0x00000000, true); + push_arg_entry_of("one", 0x3f800000, true); +} + +size_t jit_less_equal_emitter::aux_vecs_count() const { + return 2; +} + +/// LOGICAL_AND /// +jit_logical_and_emitter::jit_logical_and_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) { + prepare_table(); +} + +size_t jit_logical_and_emitter::get_inputs_num() { return 2; } + +void jit_logical_and_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) { + if (host_isa_ == cpu::sse42) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx512_common) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + assert(!"unsupported isa"); + } +} + +template +void jit_logical_and_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + using Vmm = typename conditional3::type; + Vmm vmm_src0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src1 = 
Vmm(in_vec_idxs[1]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); + Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); + Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); + Vmm vmm_aux2 = Vmm(aux_vec_idxs[2]); + + if (isa == cpu::sse42) { + h->pxor(vmm_aux0, vmm_aux0); + h->cmpps(vmm_aux0, vmm_src0, _cmp_eq_oq); + h->movups(vmm_dst, table_val("one")); + h->pxor(vmm_aux1, vmm_aux1); + h->blendvps(vmm_dst, vmm_aux1); + + h->pxor(vmm_aux0, vmm_aux0); + h->cmpps(vmm_aux0, vmm_src1, _cmp_eq_oq); + h->movups(vmm_aux2, table_val("one")); + h->pxor(vmm_aux1, vmm_aux1); + h->blendvps(vmm_aux2, vmm_aux1); + + h->uni_vandps(vmm_dst, vmm_dst, vmm_aux2); + } else if (isa == cpu::avx2) { + h->vcmpeqps(vmm_aux0, vmm_src0, table_val("zero")); + h->uni_vmovups(vmm_dst, table_val("one")); + h->vblendvps(vmm_dst, vmm_dst, table_val("zero"), vmm_aux0); + + h->vcmpeqps(vmm_aux1, vmm_src1, table_val("zero")); + h->uni_vmovups(vmm_aux0, table_val("one")); + h->vblendvps(vmm_aux0, vmm_aux0, table_val("zero"), vmm_aux1); + + h->uni_vandps(vmm_dst, vmm_dst, vmm_aux0); + } else { + h->vcmpps(k_mask, vmm_src0, table_val("zero"), _cmp_eq_oq); + h->uni_vmovups(vmm_aux0, table_val("one")); + h->vblendmps(vmm_dst | k_mask, vmm_aux0, table_val("zero")); + + h->vcmpps(k_mask, vmm_src1, table_val("zero"), _cmp_eq_oq); + h->vblendmps(vmm_aux0 | k_mask, vmm_aux0, table_val("zero")); + + h->uni_vandps(vmm_dst, vmm_dst, vmm_aux0); + } +} + +void jit_logical_and_emitter::register_table_entries() { + push_arg_entry_of("zero", 0x00000000, true); + push_arg_entry_of("one", 0x3f800000, true); +} + +size_t jit_logical_and_emitter::aux_vecs_count() const { + return 3; +} + + +/// LOGICAL_OR /// +jit_logical_or_emitter::jit_logical_or_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) { + prepare_table(); +} + +size_t jit_logical_or_emitter::get_inputs_num() { return 2; } + +void jit_logical_or_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) { + if (host_isa_ == cpu::sse42) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx512_common) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + assert(!"unsupported isa"); + } +} + +template +void jit_logical_or_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + using Vmm = typename conditional3::type; + Vmm vmm_src0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); + Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); + Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); + Vmm vmm_aux2 = Vmm(aux_vec_idxs[2]); + + if (isa == cpu::sse42) { + h->pxor(vmm_aux0, vmm_aux0); + h->cmpps(vmm_aux0, vmm_src0, _cmp_eq_oq); + h->movups(vmm_dst, table_val("one")); + h->pxor(vmm_aux1, vmm_aux1); + h->blendvps(vmm_dst, vmm_aux1); + + h->pxor(vmm_aux0, vmm_aux0); + h->cmpps(vmm_aux0, vmm_src1, _cmp_eq_oq); + h->movups(vmm_aux2, table_val("one")); + h->pxor(vmm_aux1, vmm_aux1); + h->blendvps(vmm_aux2, vmm_aux1); + + h->uni_vorps(vmm_dst, vmm_dst, vmm_aux2); + } else if (isa == cpu::avx2) { + h->vcmpeqps(vmm_aux0, vmm_src0, table_val("zero")); + h->uni_vmovups(vmm_dst, table_val("one")); + h->vblendvps(vmm_dst, vmm_dst, table_val("zero"), vmm_aux0); + + h->vcmpeqps(vmm_aux1, vmm_src1, table_val("zero")); + h->uni_vmovups(vmm_aux0, table_val("one")); + h->vblendvps(vmm_aux0, vmm_aux0, 
table_val("zero"), vmm_aux1); + + h->uni_vorps(vmm_dst, vmm_dst, vmm_aux0); + } else { + h->vcmpps(k_mask, vmm_src0, table_val("zero"), _cmp_eq_oq); + h->uni_vmovups(vmm_aux0, table_val("one")); + h->vblendmps(vmm_dst | k_mask, vmm_aux0, table_val("zero")); + + h->vcmpps(k_mask, vmm_src1, table_val("zero"), _cmp_eq_oq); + h->vblendmps(vmm_aux0 | k_mask, vmm_aux0, table_val("zero")); + + h->uni_vorps(vmm_dst, vmm_dst, vmm_aux0); + } +} + +void jit_logical_or_emitter::register_table_entries() { + push_arg_entry_of("zero", 0x00000000, true); + push_arg_entry_of("one", 0x3f800000, true); +} + +size_t jit_logical_or_emitter::aux_vecs_count() const { + return 3; +} + +/// LOGICAL_XOR /// +jit_logical_xor_emitter::jit_logical_xor_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) { + prepare_table(); +} + +size_t jit_logical_xor_emitter::get_inputs_num() { return 2; } + +void jit_logical_xor_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) { + if (host_isa_ == cpu::sse42) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::avx512_common) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + assert(!"unsupported isa"); + } +} + +template +void jit_logical_xor_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + using Vmm = typename conditional3::type; + Vmm vmm_src0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); + Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); + Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); + Vmm vmm_aux2 = Vmm(aux_vec_idxs[2]); + + if (isa == cpu::sse42) { + h->pxor(vmm_aux0, vmm_aux0); + h->cmpps(vmm_aux0, vmm_src0, _cmp_eq_oq); + h->movups(vmm_dst, table_val("one")); + h->pxor(vmm_aux1, vmm_aux1); + h->blendvps(vmm_dst, vmm_aux1); + + h->pxor(vmm_aux0, vmm_aux0); + h->cmpps(vmm_aux0, vmm_src1, _cmp_eq_oq); + h->movups(vmm_aux2, table_val("one")); + h->pxor(vmm_aux1, vmm_aux1); + h->blendvps(vmm_aux2, vmm_aux1); + + h->uni_vxorps(vmm_dst, vmm_dst, vmm_aux2); + } else if (isa == cpu::avx2) { + h->vcmpeqps(vmm_aux0, vmm_src0, table_val("zero")); + h->uni_vmovups(vmm_dst, table_val("one")); + h->vblendvps(vmm_dst, vmm_dst, table_val("zero"), vmm_aux0); + + h->vcmpeqps(vmm_aux1, vmm_src1, table_val("zero")); + h->uni_vmovups(vmm_aux0, table_val("one")); + h->vblendvps(vmm_aux0, vmm_aux0, table_val("zero"), vmm_aux1); + + h->uni_vxorps(vmm_dst, vmm_dst, vmm_aux0); + } else { + h->vcmpps(k_mask, vmm_src0, table_val("zero"), _cmp_eq_oq); + h->uni_vmovups(vmm_aux0, table_val("one")); + h->vblendmps(vmm_dst | k_mask, vmm_aux0, table_val("zero")); + + h->vcmpps(k_mask, vmm_src1, table_val("zero"), _cmp_eq_oq); + h->vblendmps(vmm_aux0 | k_mask, vmm_aux0, table_val("zero")); + + h->uni_vxorps(vmm_dst, vmm_dst, vmm_aux0); + } +} + +void jit_logical_xor_emitter::register_table_entries() { + push_arg_entry_of("zero", 0x00000000, true); + push_arg_entry_of("one", 0x3f800000, true); +} + +size_t jit_logical_xor_emitter::aux_vecs_count() const { + return 3; +} + +/// LOGICAL_NOT /// +jit_logical_not_emitter::jit_logical_not_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) { + prepare_table(); +} + +size_t jit_logical_not_emitter::get_inputs_num() { 
return 1; }
+
+void jit_logical_not_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                        const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    if (host_isa_ == cpu::sse42) {
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx2) {
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx512_common) {
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+    } else {
+        assert(!"unsupported isa");
+    }
+}
+
+template <cpu_isa_t isa>
+void jit_logical_not_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
+    Vmm vmm_dst = Vmm(out_vec_idxs[0]);
+    Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
+    Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
+
+    if (isa == cpu::sse42) {
+        h->pxor(vmm_aux0, vmm_aux0);
+        h->cmpps(vmm_aux0, vmm_src0, _cmp_eq_oq);
+        h->movups(vmm_aux1, table_val("one"));
+        h->pxor(vmm_dst, vmm_dst);
+        h->blendvps(vmm_dst, vmm_aux1);
+    } else if (isa == cpu::avx2) {
+        h->vcmpeqps(vmm_aux0, vmm_src0, table_val("zero"));
+        h->uni_vmovups(vmm_dst, table_val("zero"));
+        h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0);
+    } else {
+        h->vcmpps(k_mask, vmm_src0, table_val("zero"), _cmp_eq_oq);
+        h->uni_vmovups(vmm_dst, table_val("zero"));
+        h->vblendmps(vmm_dst | k_mask, vmm_dst, table_val("one"));
+    }
+}
+
+void jit_logical_not_emitter::register_table_entries() {
+    push_arg_entry_of("zero", 0x00000000, true);
+    push_arg_entry_of("one", 0x3f800000, true);
+}
+
+size_t jit_logical_not_emitter::aux_vecs_count() const {
+    return 2;
+}
+
+/// POWER_STATIC ///
+jit_power_static_emitter::jit_power_static_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+: jit_emitter(host, host_isa, node, exec_prc) {
+    prepare_table();
+}
+
+size_t jit_power_static_emitter::get_inputs_num() { return 1; }
+
+void jit_power_static_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                         const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    if (host_isa_ == cpu::sse42) {
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx2) {
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx512_common) {
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+    } else {
+        assert(!"unsupported isa");
+    }
+}
+
+template <cpu_isa_t isa>
+void jit_power_static_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
+    Vmm vmm_dst = Vmm(out_vec_idxs[0]);
+    Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
+
+    auto *powerLayer = dynamic_cast<InferenceEngine::PowerLayer *>(n.getCnnLayer().get());
+    if (powerLayer == nullptr)
+        THROW_IE_EXCEPTION << "Cannot convert power layer.";
+
+    float power = powerLayer->power;
+    float scale = powerLayer->scale;
+    float shift = powerLayer->offset;
+
+    Xmm xmm0 = Xmm(0), xmm1 = Xmm(1);
+
+    if (scale != 1.f || shift != 0.f) {
+        if (isa == cpu::sse42) {
+            h->uni_vmovups(vmm_aux0, table_val("scale"));
+            h->uni_vmulps(vmm_aux0, vmm_aux0, vmm_src0);
+            h->uni_vmovups(vmm_dst, table_val("shift"));
+            h->uni_vaddps(vmm_dst, vmm_dst, vmm_aux0);
+        } else {
+            if (vmm_dst.getIdx() != vmm_src0.getIdx()) {
+                h->uni_vmovups(vmm_dst, table_val("shift"));
+                h->uni_vfmadd231ps(vmm_dst, vmm_src0, table_val("scale"));
+            } else {
+                h->uni_vmovups(vmm_aux0, table_val("shift"));
+                h->uni_vfmadd231ps(vmm_aux0, vmm_src0, table_val("scale"));
+                h->uni_vmovups(vmm_dst, vmm_aux0);
+            }
+        }
+    } else
{ + if (vmm_dst.getIdx() != vmm_src0.getIdx()) + h->uni_vmovups(vmm_dst, vmm_src0); + } + + if (power == 1.f) { + } else if (power == 0.5f || power == -0.5f) { + h->uni_vsqrtps(vmm_dst, vmm_dst); + + if (power < 0.f) { + h->uni_vmovups(vmm_aux0, table_val("one")); + if (isa == cpu::sse42) { + h->uni_vdivps(vmm_aux0, vmm_aux0, vmm_dst); + h->uni_vmovups(vmm_dst, vmm_aux0); + } else { + h->uni_vdivps(vmm_dst, vmm_aux0, vmm_dst); + } + } + } else if (std::floor(power) == power && power != 0) { + int ipower = std::abs(static_cast(power)); + h->uni_vmovups(vmm_aux0, vmm_dst); + for (int i = 1; i < ipower; i++) { + h->uni_vmulps(vmm_dst, vmm_dst, vmm_aux0); + } + + if (power < 0.f) { + h->uni_vmovups(vmm_aux0, table_val("one")); + if (isa == cpu::sse42) { + h->uni_vdivps(vmm_aux0, vmm_aux0, vmm_dst); + h->uni_vmovups(vmm_dst, vmm_aux0); + } else { + h->uni_vdivps(vmm_dst, vmm_aux0, vmm_dst); + } + } + } else { + h->uni_vmovups(vmm_aux0, table_val("power")); + + // caller obligation to save gprs as callee may use them + size_t gpr_size = 8; + Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->rax, + h->rcx, h->rdx, h->rdi, h->rsi, h->rbp, h->rbx}; + size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]); + + h->sub(h->rsp, n_gprs_to_save * gpr_size); + for (size_t i = 0; i < n_gprs_to_save; ++i) + h->mov(h->ptr[h->rsp + i * gpr_size], gprs_to_save[i]); + + // caller obligation to save k-regs as callee may use them + size_t n_k_regs_to_save = 8; + if (isa == cpu::avx512_common || isa == cpu::avx512_core) { + h->sub(h->rsp, n_k_regs_to_save * k_mask_size); + for (size_t i = 0; i < n_k_regs_to_save; ++i) { + if (mayiuse(avx512_core)) + h->kmovq(h->ptr[h->rsp + i * k_mask_size], Opmask(i)); + else + h->kmovw(h->ptr[h->rsp + i * k_mask_size], Opmask(i)); + } + } + + // 1. Caller obligation to save vector registers as callee may use them. + // 2. Additionally save space for vmm_src, to put the answer in-place on + // this space and space for beta. + // 3. There is an implicit assumption that the host code uses the same + // `isa` as the injector. Once the assumption is wrong, `vecs_count` and + // `vlen` should be replaced with `host_isa::vlen` and + // `host_isa::vecs_count`. + h->sub(h->rsp, (get_max_vecs_count() + 2) * get_vec_length()); + for (size_t i = 2; i < get_max_vecs_count() + 2; ++i) + h->uni_vmovups(h->ptr[h->rsp + i * get_vec_length()], Vmm(i - 2)); + h->uni_vmovups(h->ptr[h->rsp + 0 * get_vec_length()], vmm_dst); // src + h->uni_vmovups(h->ptr[h->rsp + 1 * get_vec_length()], vmm_aux0); // beta + + // save function address in gpr to pass in in call instruction + h->mov(h->rbp, reinterpret_cast(powf)); + + // align stack on 16-byte as ABI requires + h->mov(h->rbx, h->rsp); + h->and_(h->rbx, 0xf); + h->sub(h->rsp, h->rbx); + + // Take src, apply powf on it and replace value on a stack with dst. 
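+        // Equivalent scalar loop (illustrative): for each of the vlen / 4 lanes,
+        //   stack[i] = powf(stack[i], stack[vlen / 4 + i]);
+        // the first float argument travels in xmm0 and the second in xmm1, as the
+        // calling convention requires.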
+/// PRELU ///
+jit_prelu_emitter::jit_prelu_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+: jit_emitter(host, host_isa, node, exec_prc) {
+    prepare_table();
+}
+
+size_t jit_prelu_emitter::get_inputs_num() { return 2; }
+
+void jit_prelu_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                  const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    if (host_isa_ == cpu::sse42) {
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx2) {
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+    } else if (host_isa_ == cpu::avx512_common) {
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+    } else {
+        assert(!"unsupported isa");
+    }
+}
+
+template <cpu_isa_t isa>
+void jit_prelu_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
+    Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
+    Vmm vmm_dst = Vmm(out_vec_idxs[0]);
+    Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
+    Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
+
+    if (isa == cpu::sse42) {
+        h->pxor(vmm_aux0, vmm_aux0);
+        h->cmpps(vmm_aux0, vmm_src0, _cmp_gt_os);
+        h->movups(vmm_aux1, vmm_src1);
+        h->mulps(vmm_aux1, vmm_src0);
+        if (vmm_src0.getIdx() != vmm_dst.getIdx())
+            h->movups(vmm_dst, vmm_src0);
+        h->blendvps(vmm_dst, vmm_aux1);
+    } else if (isa == cpu::avx2) {
+        h->vmulps(vmm_aux0, vmm_src0, vmm_src1);
+        h->vxorps(vmm_aux1, vmm_aux1, vmm_aux1);
+        h->vcmpgtps(vmm_aux1, vmm_src0, vmm_aux1);
+        h->vblendvps(vmm_dst, vmm_aux0, vmm_src0, vmm_aux1);
+    } else if (isa == cpu::avx512_common) {
+        h->vxorpd(vmm_aux0, vmm_aux0, vmm_aux0);
+        if (vmm_src0.getIdx() != vmm_dst.getIdx())
+            h->vmovups(vmm_dst, vmm_src0);
+        h->vcmpps(k_mask, vmm_src0, vmm_aux0, _cmp_lt_os);
+        h->vmulps(vmm_dst | k_mask, vmm_src0, vmm_src1);
+    }
+}
+
+size_t jit_prelu_emitter::aux_vecs_count() const {
+    return 2;
+}
+
+} // namespace MKLDNNPlugin
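// Illustrative sketch, not part of the patch: the PReLU branches above only
// differ in how the sign mask is materialized (SSE blendvps through xmm0,
// AVX2 vblendvps, AVX-512 k-mask); every lane computes:
//
//     static inline float prelu_ref(float src, float slope) {
//         return src < 0.f ? src * slope : src;
//     }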
diff --git a/inference-engine/src/mkldnn_plugin/nodes/jit_eltwise_emitters.hpp b/inference-engine/src/mkldnn_plugin/nodes/jit_eltwise_emitters.hpp
new file mode 100644
index 0000000..baa3fd8
--- /dev/null
+++ b/inference-engine/src/mkldnn_plugin/nodes/jit_eltwise_emitters.hpp
@@ -0,0 +1,417 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "common/emitter.h"
+#include "jit_generator.hpp"
+#include "mkldnn_node.h"
+
+namespace MKLDNNPlugin {
+
+class jit_add_emitter : public jit_emitter {
+public:
+    jit_add_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                    InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+};
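// Illustrative sketch, not part of the patch: every emitter below repeats the
// jit_add_emitter shape, so a JIT kernel can own all of them behind the
// jit_emitter base class. `kernel` and `node` are assumed to come from the
// enclosing Eltwise kernel generator:
//
//     std::shared_ptr<jit_emitter> e =
//             std::make_shared<jit_add_emitter>(kernel, mkldnn::impl::cpu::avx2, node);
//     e->emit(in_vec_idxs, out_vec_idxs, pool_vec_idxs, pool_gpr_idxs);  // in the loop body
//     e->emit_table();                                                   // once, after the kernel body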
+
+class jit_mul_add_emitter : public jit_emitter {
+public:
+    jit_mul_add_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                        InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    size_t aux_vecs_count() const override;
+};
+
+
+class jit_subtract_emitter : public jit_emitter {
+public:
+    jit_subtract_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                         InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+};
+
+
+class jit_multiply_emitter : public jit_emitter {
+public:
+    jit_multiply_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                         InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+};
+
+
+class jit_divide_emitter : public jit_emitter {
+public:
+    jit_divide_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                       InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+};
+
+
+class jit_floor_mod_emitter : public jit_emitter {
+public:
+    jit_floor_mod_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                          InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+    size_t aux_vecs_count() const override;
+};
+
+
+class jit_mod_emitter : public jit_emitter {
+public:
+    jit_mod_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                    InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+    size_t aux_vecs_count() const override;
+};
+
+
+class jit_maximum_emitter : public jit_emitter {
+public:
+    jit_maximum_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                        InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+    static std::set<InferenceEngine::Precision> get_supported_precisions();
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+};
+
+
+class jit_minimum_emitter : public jit_emitter {
+public:
+    jit_minimum_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                        InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+    static std::set<InferenceEngine::Precision> get_supported_precisions();
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+};
+
+
+class jit_squared_difference_emitter : public jit_emitter {
+public:
+    jit_squared_difference_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                                   InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+};
+
+
+class jit_power_dynamic_emitter : public jit_emitter {
+public:
+    jit_power_dynamic_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                              InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+};
+
+
+class jit_equal_emitter : public jit_emitter {
+public:
+    jit_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                      InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    void register_table_entries() override;
+    size_t aux_vecs_count() const override;
+};
+
+
+class jit_not_equal_emitter : public jit_emitter {
+public:
+    jit_not_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                          InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    void register_table_entries() override;
+    size_t aux_vecs_count() const override;
+};
+
+
+class jit_greater_emitter : public jit_emitter {
+public:
+    jit_greater_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                        InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    void register_table_entries() override;
+    size_t aux_vecs_count() const override;
+};
+
+
+class jit_greater_equal_emitter : public jit_emitter {
+public:
+    jit_greater_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                              InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    void register_table_entries() override;
+    size_t aux_vecs_count() const override;
+};
+
+
+class jit_less_emitter : public jit_emitter {
+public:
+    jit_less_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                     InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    void register_table_entries() override;
+    size_t aux_vecs_count() const override;
+};
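// Illustrative sketch, not part of the patch: the comparison emitters above
// and below (equal .. less_equal) all lower to a cmpps/k-mask compare followed
// by a blend of the "one"/"zero" table constants, i.e. per lane:
//
//     static inline float less_ref(float a, float b) {
//         return a < b ? 1.0f : 0.0f;  // the sibling emitters only swap the predicate
//     }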
+
+
+class jit_less_equal_emitter : public jit_emitter {
+public:
+    jit_less_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                           InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    void register_table_entries() override;
+    size_t aux_vecs_count() const override;
+};
+
+
+class jit_logical_and_emitter : public jit_emitter {
+public:
+    jit_logical_and_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    void register_table_entries() override;
+    size_t aux_vecs_count() const override;
+};
+
+
+class jit_logical_or_emitter : public jit_emitter {
+public:
+    jit_logical_or_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                           InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    void register_table_entries() override;
+    size_t aux_vecs_count() const override;
+};
+
+
+class jit_logical_xor_emitter : public jit_emitter {
+public:
+    jit_logical_xor_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    void register_table_entries() override;
+    size_t aux_vecs_count() const override;
+};
+
+class jit_logical_not_emitter : public jit_emitter {
+public:
+    jit_logical_not_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    void register_table_entries() override;
+    size_t aux_vecs_count() const override;
+};
+
+class jit_power_static_emitter : public jit_emitter {
+public:
+    jit_power_static_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                             InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    void register_table_entries() override;
+    size_t aux_vecs_count() const override;
+};
+
+class jit_prelu_emitter : public jit_emitter {
+public:
+    jit_prelu_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                      InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
+
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    size_t aux_vecs_count() const override;
+};
+
+} // namespace MKLDNNPlugin
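// Illustrative note, not part of the patch: the next two files adapt mkldnn's
// own activation injectors (jit_uni_eltwise_injector_f32) to the same emitter
// interface, so relu/gelu/tanh-style ops reuse oneDNN's kernels. Assumed
// usage, with `kernel` and `eltwiseNode` provided by the caller:
//
//     jit_mkldnn_emitter em(kernel, mkldnn::impl::cpu::avx2, eltwiseNode);
//     em.emit({src_idx}, {dst_idx}, {}, {});  // copies src to dst, then computes in place
//     em.emit_table();                        // emits the injector's constant table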
diff --git a/inference-engine/src/mkldnn_plugin/nodes/jit_mkldnn_emitters.cpp b/inference-engine/src/mkldnn_plugin/nodes/jit_mkldnn_emitters.cpp
new file mode 100644
index 0000000..9be8fd9
--- /dev/null
+++ b/inference-engine/src/mkldnn_plugin/nodes/jit_mkldnn_emitters.cpp
@@ -0,0 +1,70 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "common/emitter.h"
+#include "jit_mkldnn_emitters.hpp"
+#include "mkldnn_eltwise_node.h"
+#include "legacy/ie_layers.h"
+
+using namespace mkldnn::impl::utils;
+using namespace mkldnn::impl::cpu;
+using namespace Xbyak;
+
+namespace MKLDNNPlugin {
+
+jit_mkldnn_emitter::jit_mkldnn_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, InferenceEngine::Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+    auto& eltwiseNode = dynamic_cast<const MKLDNNEltwiseNode&>(n);
+
+    auto alg = static_cast<mkldnn_alg_kind_t>(eltwiseNode.getAlgorithm());
+
+    if (host_isa_ == cpu::sse42) {
+        eltwise_injector_sse42 = std::make_shared<jit_uni_eltwise_injector_f32<cpu::sse42>>(
+                host, alg, eltwiseNode.getAlpha(), eltwiseNode.getBeta());
+    } else if (host_isa_ == cpu::avx2) {
+        eltwise_injector_avx2 = std::make_shared<jit_uni_eltwise_injector_f32<cpu::avx2>>(
+                host, alg, eltwiseNode.getAlpha(), eltwiseNode.getBeta());
+    } else if (host_isa_ == cpu::avx512_common) {
+        eltwise_injector_avx512_common = std::make_shared<jit_uni_eltwise_injector_f32<cpu::avx512_common>>(
+                host, alg, eltwiseNode.getAlpha(), eltwiseNode.getBeta());
+    } else {
+        assert(!"unsupported isa");
+    }
+}
+
+size_t jit_mkldnn_emitter::get_inputs_num() { return 1; }
+
+void jit_mkldnn_emitter::emit(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                              const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    if (host_isa_ == cpu::sse42) {
+        if (out_vec_idxs[0] != in_vec_idxs[0])
+            h->uni_vmovups(Xmm(out_vec_idxs[0]), Xmm(in_vec_idxs[0]));
+        eltwise_injector_sse42->compute_vector(out_vec_idxs[0]);
+    } else if (host_isa_ == cpu::avx2) {
+        if (out_vec_idxs[0] != in_vec_idxs[0])
+            h->uni_vmovups(Ymm(out_vec_idxs[0]), Ymm(in_vec_idxs[0]));
+        eltwise_injector_avx2->compute_vector(out_vec_idxs[0]);
+    } else if (host_isa_ == cpu::avx512_common) {
+        if (out_vec_idxs[0] != in_vec_idxs[0])
+            h->uni_vmovups(Zmm(out_vec_idxs[0]), Zmm(in_vec_idxs[0]));
+        eltwise_injector_avx512_common->compute_vector(out_vec_idxs[0]);
+    } else {
+        assert(!"unsupported isa");
+    }
+}
+
+void jit_mkldnn_emitter::emit_table() {
+    if (host_isa_ == cpu::sse42) {
+        eltwise_injector_sse42->prepare_table();
+    } else if (host_isa_ == cpu::avx2) {
+        eltwise_injector_avx2->prepare_table();
+    } else if (host_isa_ == cpu::avx512_common) {
+        eltwise_injector_avx512_common->prepare_table();
+    } else {
+        assert(!"unsupported isa");
+    }
+}
+
+
+} // namespace MKLDNNPlugin
diff --git a/inference-engine/src/mkldnn_plugin/nodes/jit_mkldnn_emitters.hpp b/inference-engine/src/mkldnn_plugin/nodes/jit_mkldnn_emitters.hpp
new file mode 100644
index 0000000..cfd4039
--- /dev/null
+++ b/inference-engine/src/mkldnn_plugin/nodes/jit_mkldnn_emitters.hpp
@@ -0,0 +1,32 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "common/emitter.h"
+#include "jit_generator.hpp"
+#include "mkldnn_node.h"
+#include "jit_uni_eltwise.hpp"
+
+namespace MKLDNNPlugin {
+
+class jit_mkldnn_emitter : public jit_emitter {
+public:
+    jit_mkldnn_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                       InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+    void emit(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+              const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
+
+    void emit_table() override;
+
+private:
+    std::shared_ptr<mkldnn::impl::cpu::jit_uni_eltwise_injector_f32<mkldnn::impl::cpu::sse42>> eltwise_injector_sse42;
+    std::shared_ptr<mkldnn::impl::cpu::jit_uni_eltwise_injector_f32<mkldnn::impl::cpu::avx2>> eltwise_injector_avx2;
+    std::shared_ptr<mkldnn::impl::cpu::jit_uni_eltwise_injector_f32<mkldnn::impl::cpu::avx512_common>> eltwise_injector_avx512_common;
+};
+
+} // namespace MKLDNNPlugin
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.cpp
deleted file mode 100644
index 144f8d9..0000000
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.cpp
+++ /dev/null
@@ -1,252 +0,0 @@
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#include "mkldnn_activation_node.h"
-#include "desc_iterator.hpp"
-#include
-#include
-#include
-#include
-
-using namespace mkldnn;
-using namespace MKLDNNPlugin;
-using namespace InferenceEngine;
-using namespace InferenceEngine::details;
-
-// TODO: (ichuraev) I don't fully sure that names of types and parameters are correct for square, abs, sqrt, linear, bounded_relu and soft_relu
-caseless_map<std::string, std::function<void(GenericLayer*, mkldnn::algorithm&, float&, float&)>> MKLDNNActivationNode::initializers = {
-        {"relu", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
-            alpha = activationLayer->GetParamAsFloat("negative_slope", 0.0f);
-            beta = 0.0f;
-            algorithm = eltwise_relu;
-        }},
-        {"gelu", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
-            alpha = 0.0f;
-            beta = 0.0f;
-            algorithm = eltwise_gelu;
-        }},
-        {"elu", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
-            alpha = activationLayer->GetParamAsFloat("alpha", 1.0f);
-            beta = 0.0f;
-            algorithm = eltwise_elu;
-        }},
-        {"tanh", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
-            alpha = 0.0f;
-            beta = 0.0f;
-            algorithm = eltwise_tanh;
-        }},
-        {"logistic", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
-            alpha = 0.0f;
-            beta = 0.0f;
-            algorithm = eltwise_logistic;
-        }},
-        {"square", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
-            alpha = 0.0f;
-            beta = 0.0f;
-            algorithm = eltwise_square;
-        }},
-        {"abs", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float&
alpha, float& beta) { - alpha = 0.0f; - beta = 0.0f; - algorithm = eltwise_abs; - }}, - {"sqrt", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) { - alpha = 0.0f; - beta = 0.0f; - algorithm = eltwise_sqrt; - }}, - {"linear", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) { - alpha = activationLayer->GetParamAsFloat("alpha", 1.0f); - beta = activationLayer->GetParamAsFloat("beta", 0.0f); - algorithm = eltwise_linear; - }}, - {"bounded_relu", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) { - alpha = activationLayer->GetParamAsFloat("alpha", 0.0f); - beta = 0.0f; - algorithm = eltwise_bounded_relu; - }}, - {"soft_relu", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) { - alpha = 0.0f; - beta = 0.0f; - algorithm = eltwise_soft_relu; - }}, - {"relu6", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) { - alpha = activationLayer->GetParamAsFloat("n", 6.0f); - beta = 0.0f; - algorithm = eltwise_bounded_relu; - }}, - {"clamp", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) { - alpha = activationLayer->GetParamAsFloat("max", 1.0f); - beta = activationLayer->GetParamAsFloat("min", 0.0f); - algorithm = eltwise_clamp; - }}, - {"exp", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) { - alpha = 0.0f; - beta = 0.0f; - algorithm = eltwise_exp; - }}, - {"not", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) { - alpha = 0.0f; - beta = 0.0f; - algorithm = eltwise_not; - }}, - {"swish", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) { - alpha = activationLayer->GetParamAsFloat("alpha", 1.0f); - beta = 0.0f; - algorithm = eltwise_swish; - }}, - {"hswish", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) { - alpha = 0.0f; - beta = 0.0f; - algorithm = eltwise_hswish; - }}, - {"mish", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) { - alpha = 0.0f; - beta = 0.0f; - algorithm = eltwise_mish; - }}, - {"hsigmoid", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) { - alpha = 0.0f; - beta = 0.0f; - algorithm = eltwise_hsigmoid; - }}, -}; - -MKLDNNActivationNode::MKLDNNActivationNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, - MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(layer, eng, cache) { - GenericLayer* activationLayer = getCnnLayer().get(); - if (activationLayer == nullptr) - THROW_IE_EXCEPTION << "Cannot get CNNLayer."; - - std::string type = activationLayer->type; - CaselessEq comparator; - if (comparator(type, "activation")) - type = activationLayer->GetParamAsString("type"); - if (comparator(type, "sigmoid")) - type = "logistic"; - - if (initializers.find(type) != initializers.end()) - initializers[type](activationLayer, algorithm, alpha, beta); -} - -void MKLDNNActivationNode::getSupportedDescriptors() { - if (!descs.empty()) - return; - - if (getParentEdges().size() != 1) - THROW_IE_EXCEPTION << "Incorrect number of input edges for layer " << getName(); - if (!getChildEdges().size()) - THROW_IE_EXCEPTION << "Incorrect number of output edges for layer " << getName(); - - auto parentOutDims = getParentEdgeAt(0)->getDims(); - - InferenceEngine::Precision precision = 
getCnnLayer()->insData[0].lock()->getPrecision(); - - // FIXME: MKLDNN doesn't support not inputs with number of dimensions less than 4 for activation - while (parentOutDims.ndims() < 4) - parentOutDims.push_back(1); - for (auto format : getAvailableFormatsForDims(parentOutDims)) { - MKLDNNMemoryDesc in_candidate(parentOutDims, MKLDNNExtensionUtils::IEPrecisionToDataType(precision), format); - createDescriptor({in_candidate}, {}); - } -} - -void MKLDNNActivationNode::createPrimitive() { - if (prim) - return; - - auto prim_desc = createPrimitiveDescriptor(); - - prim.reset(new eltwise_forward(prim_desc, getParentEdgeAt(0)->getMemory().GetPrimitive(), - getChildEdgeAt(0)->getMemory().GetPrimitive())); -} - -bool MKLDNNActivationNode::created() const { - return getType() == Activation; -} - -void MKLDNNActivationNode::createDescriptor(const std::vector &inputDesc, - const std::vector &outputDesc) { - MKLDNNMemoryDesc inDesc(inputDesc[0]); - MKLDNNDescriptor desc(std::shared_ptr( - new eltwise_forward::desc(prop_kind::forward_scoring, getAlgorithm(), inDesc, getAlpha(), getBeta()))); - descs.push_back(desc); -} - -void MKLDNNActivationNode::initOptimalPrimitiveDescriptor() { - auto config = getSelectedPrimitiveDescriptor()->getConfig(); - if (isInitConfig(config)) - return; - - if (config.inConfs.size() != 1 || config.outConfs.size() != 1 || - (!isUninitTensorDesc(config.inConfs[0].desc) && - !isUninitTensorDesc(config.outConfs[0].desc) && config.inConfs[0].desc != config.outConfs[0].desc)) - THROW_IE_EXCEPTION << "Layer " << getName() << " has incorrect selected config!"; - - if (!isUninitTensorDesc(config.inConfs[0].desc)) { - config.outConfs[0].desc = config.inConfs[0].desc; - } else if (!isUninitTensorDesc(config.outConfs[0].desc)) { - config.inConfs[0].desc = config.outConfs[0].desc; - } else { - config.outConfs[0].desc = config.inConfs[0].desc = getConfiguredInputDesc(config, 0); - } - - initDescriptor(config); -} - -MKLDNNMemoryDesc MKLDNNActivationNode::getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) { - InferenceEngine::TensorDesc desc = MKLDNNMemoryDesc(primitive_desc_it.src_primitive_desc(idx).desc()); - - auto parentOutDims = getParentEdgeAt(idx)->getDims().ToSizeVector(); - - SizeVector blocked_dims, order, dimOffsets, strides; - size_t offset = desc.getBlockingDesc().getOffsetPadding(); - - for (size_t i = 0; i < desc.getBlockingDesc().getStrides().size(); i++) { - if (desc.getBlockingDesc().getOrder()[i] >= parentOutDims.size()) - continue; - - blocked_dims.push_back(desc.getBlockingDesc().getBlockDims()[i]); - order.push_back(desc.getBlockingDesc().getOrder()[i]); - dimOffsets.push_back(desc.getBlockingDesc().getOffsetPaddingToData()[i]); - strides.push_back(desc.getBlockingDesc().getStrides()[i]); - } - if (desc.getLayout() == InferenceEngine::Layout::ANY) - return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(), - parentOutDims, - desc.getLayout())); - else - return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(), - parentOutDims, - {blocked_dims, order, offset, dimOffsets, strides})); -} - -MKLDNNMemoryDesc MKLDNNActivationNode::getDstMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) { - InferenceEngine::TensorDesc desc = MKLDNNMemoryDesc(primitive_desc_it.dst_primitive_desc(idx).desc()); - - auto childInDims = getChildEdgeAt(idx)->getDims().ToSizeVector(); - - SizeVector blocked_dims, order, dimOffsets, strides; - size_t offset = desc.getBlockingDesc().getOffsetPadding(); - - for (size_t i 
= 0; i < desc.getBlockingDesc().getStrides().size(); i++) { - if (desc.getBlockingDesc().getOrder()[i] >= childInDims.size()) - continue; - - blocked_dims.push_back(desc.getBlockingDesc().getBlockDims()[i]); - order.push_back(desc.getBlockingDesc().getOrder()[i]); - dimOffsets.push_back(desc.getBlockingDesc().getOffsetPaddingToData()[i]); - strides.push_back(desc.getBlockingDesc().getStrides()[i]); - } - if (desc.getLayout() == InferenceEngine::Layout::ANY) - return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(), - childInDims, - desc.getLayout())); - else - return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(), - childInDims, - {blocked_dims, order, offset, dimOffsets, strides})); -} - -REG_MKLDNN_PRIM_FOR(MKLDNNActivationNode, Activation); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.h deleted file mode 100644 index 997d4a8..0000000 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (C) 2018-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include -#include -#include "caseless.hpp" -#include -#include -#include - -namespace MKLDNNPlugin { - -class MKLDNNActivationNode : public MKLDNNNode { -public: - MKLDNNActivationNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); - ~MKLDNNActivationNode() override = default; - - void getSupportedDescriptors() override; - void initOptimalPrimitiveDescriptor() override; - void createDescriptor(const std::vector& inputDesc, - const std::vector& outputDesc) override; - void createPrimitive() override; - bool created() const override; - - mkldnn::algorithm getAlgorithm() const { return algorithm; } - float getAlpha() const { return alpha; } - float getBeta() const { return beta; } - - MKLDNNMemoryDesc getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) override; - MKLDNNMemoryDesc getDstMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) override; - -private: - float alpha = 0.0f; - float beta = 0.0f; - static InferenceEngine::details::caseless_map> initializers; - mkldnn::algorithm algorithm = mkldnn::algorithm::eltwise_relu; -}; - -} // namespace MKLDNNPlugin - diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.cpp index c632f82..09a49ba 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.cpp @@ -3,7 +3,6 @@ // #include "mkldnn_batchnorm_node.h" -#include "mkldnn_depthwise_node.h" #include #include "common/cpu_memcpy.h" diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.h index 9d46ef8..9237bdb 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.h @@ -25,7 +25,7 @@ public: const std::vector& outputDesc) override; void createPrimitive() override; bool created() const override; - bool fusedWithScale() const {return fusedWith.size() == 1 && fusedWith[0]->getType() == Depthwise + bool fusedWithScale() const {return fusedWith.size() == 1 && fusedWith[0]->getType() == Eltwise && fusedWith[0]->getCnnLayer()->type == "ScaleShift";} MKLDNNMemoryDesc 
getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bin_conv_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bin_conv_node.cpp
index e92a898..d309be0 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bin_conv_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bin_conv_node.cpp
@@ -5,10 +5,8 @@
 #include "mkldnn_bin_conv_node.h"
 #include "mkldnn_reorder_node.h"
 #include "mkldnn_input_node.h"
-#include "mkldnn_activation_node.h"
-#include "desc_iterator.hpp"
 #include "mkldnn_eltwise_node.h"
-#include "mkldnn_depthwise_node.h"
+#include "desc_iterator.hpp"
 #include "mkldnn_quantize_node.h"
 #include "mkldnn_conv_node.h"
 #include
@@ -116,7 +114,6 @@ void MKLDNNBinaryConvolutionNode::getSupportedDescriptors() {
             paddingR[i] = (dst - calc_dst) * stride[i];
     }
 
-    withSum = isFusedWith(Eltwise);
     withDWConv = isFusedWith(Convolution);
     withBinarization = isFusedWith(Quantize);
     for (auto &node : fusedWith) {
@@ -138,12 +135,19 @@ void MKLDNNBinaryConvolutionNode::getSupportedDescriptors() {
 #endif
     }
 
-    int expectedInputEdgesNum = baseInputsNumber + isFusedWith(Eltwise);
+    withSum = false;
+    int expectedInputEdgesNum = baseInputsNumber;
     for (int i = 0; i < fusedWith.size(); i++) {
         auto *convolutionNode = dynamic_cast<MKLDNNConvolutionNode *>(fusedWith[i].get());
         if (convolutionNode) {
            expectedInputEdgesNum += convolutionNode->getBaseIntputsNumber() - 1;
         }
+
+        auto *eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
+        if (eltwiseNode && eltwiseNode->isSum()) {
+            withSum = true;
+            expectedInputEdgesNum++;
+        }
     }
 
     if (getParentEdges().size() != expectedInputEdgesNum)
@@ -164,88 +168,13 @@ void MKLDNNBinaryConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool
     for (auto &node : fusedWith) {
 #if defined (COMPILED_CPU_MKLDNN_ELTWISE_NODE)
         auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
-        if (eltwiseNode) {
-            if (eltwiseNode->getCnnLayer()->precision == Precision::I8) {
-                auto it = eltwiseNode->getCnnLayer()->blobs.find("eltwise-sum-scale");
-                if (it != eltwiseNode->getCnnLayer()->blobs.end()) {
-                    // currently there is the only one scale while we need scale by channel :(
-                    ops.append_sum(it->second->buffer().as<float*>()[0]);
-                }
-            } else {
-                ops.append_sum(1.0);
-            }
+        if (eltwiseNode && eltwiseNode->isSum()) {
+            ops.append_sum(1.0);
             continue;
         }
-#endif
-
-#if defined(COMPILED_CPU_MKLDNN_ACTIVATION_NODE)
-        auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get());
-        if (activationNode) {
-            ops.append_eltwise(1.0, activationNode->getAlgorithm(), activationNode->getAlpha(),
-                               activationNode->getBeta());
-            continue;
-        }
-#endif
-
-#if defined (COMPILED_CPU_MKLDNN_DEPTHWISE_NODE)
-        auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode *>(node.get());
-        if (depthwiseNode) {
-            auto* depthwiseLayer = reinterpret_cast<WeightableLayer*>(depthwiseNode->getCnnLayer().get());
-
-            if (initWeights) {
-                MKLDNNDims depthwiseDims({static_cast<ptrdiff_t>(rnd_up(biasesDims[0], 16))});
-
-                PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
-                PostOpsIntBlobMemory[blob_idx]->Create(depthwiseDims, memory::data_type::f32, memory::format::x);
-                PostOpsIntBlobMemory[blob_idx]->FillZero();
-
-                PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::x,
-                                                        depthwiseLayer->_weights->buffer(),
-                                                        depthwiseLayer->_weights->size() *
-                                                        MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
-
-                if (depthwiseNode->isBroadcast()) {
-                    float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[0];
-                    for (int i = 1; i < PostOpsIntBlobMemory[blob_idx]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
-                        static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[i] = broadcastValue;
-                    }
-                }
-
-                if (depthwiseNode->getAlgorithm() == depthwise_scale_shift) {
-                    PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
-                    PostOpsIntBlobMemory[blob_idx + 1]->Create(depthwiseDims, memory::data_type::f32,
-                                                               memory::format::x);
-                    PostOpsIntBlobMemory[blob_idx + 1]->FillZero();
-                    PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::x,
-                                                                depthwiseLayer->_biases->buffer(),
-                                                                depthwiseLayer->_biases->size() *
-                                                                MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
-
-                    if (depthwiseNode->isBroadcast()) {
-                        float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[0];
-                        for (int i = 1; i < PostOpsIntBlobMemory[blob_idx + 1]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
-                            static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[i] = broadcastValue;
-                        }
-                    }
-
-                    ops.append_depthwise(depthwiseNode->getAlgorithm(),
-                                         (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(),
-                                         (const float *) PostOpsIntBlobMemory[blob_idx + 1]->GetData());
-
-                    blob_idx += 2;
-                } else {
-                    ops.append_depthwise(depthwiseNode->getAlgorithm(),
-                                         (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(),
-                                         nullptr);
-
-                    blob_idx += 1;
-                }
-            } else {
-                ops.append_depthwise(depthwiseNode->getAlgorithm(),
-                                     nullptr,
-                                     nullptr);
-            }
+        if (eltwiseNode) {
+            eltwiseNode->appendPostOps(ops);
             continue;
         }
 #endif
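// Condensed model of the fusing loop above, sketch only (names from the
// patch; `ops` is the surrounding mkldnn::post_ops): a fused Eltwise that is
// a sum becomes an in-place accumulation post-op, while any other fused
// Eltwise appends its own post-ops, which is what replaces the removed
// Activation/Depthwise branches.
//
//     for (auto &node : fusedWith) {
//         auto *eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
//         if (eltwiseNode == nullptr)
//             continue;
//         if (eltwiseNode->isSum())
//             ops.append_sum(1.0);              // second input shares the output memory
//         else
//             eltwiseNode->appendPostOps(ops);  // activation / scale-shift style post-op
//     }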
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp
index c639fbf..be5fb61 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp
@@ -20,6 +20,7 @@
 #include "mkldnn_conv_node.h"
 #include "mkldnn_quantize_node.h"
 #include "mkldnn_pooling_node.h"
+#include "mkldnn_eltwise_node.h"
 #include
 #include "common/cpu_memcpy.h"
@@ -93,12 +94,9 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
     MKLDNNDims dstDims = getChildEdgeAt(0)->getDims();
     InferenceEngine::LayerConfig config;
     config.dynBatchSupport = true;
-    bool hasEltwise = false;
 
     for (size_t i = 0; i < getParentEdges().size(); i++) {
         auto parentEdge = getParentEdgeAt(i);
-        if (parentEdge->getParent()->getType() == Eltwise)
-            hasEltwise = true;
 
         InferenceEngine::DataConfig dataConfig;
         dataConfig.inPlace = -1;
@@ -117,7 +115,7 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
     config.outConfs.resize(1);
     config.outConfs[0].inPlace = -1;
     config.outConfs[0].constant = false;
-    if ((!isMixedPrecision && outputPrecision != Precision::U8 && outputPrecision != Precision::I8) || axis != 1 || hasEltwise) {
+    if ((!isMixedPrecision && outputPrecision != Precision::U8 && outputPrecision != Precision::I8) || axis != 1) {
         auto fmt = (inputPrecision == Precision::U8 || inputPrecision == Precision::I8)
             ? dims.ndims() == 2 ? memory::format::nc
                                 : dims.ndims() == 4 ? memory::format::nhwc
                                                     : memory::format::ndhwc
@@ -155,7 +153,7 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
         }
     }
 
-    if (axis != 1 || hasEltwise)
+    if (axis != 1)
         return;
 
     auto numOfDim = static_cast<size_t>(dstDims.ndims());
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp
index 83d4862..ff40438 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp
@@ -5,10 +5,8 @@
 #include "mkldnn_conv_node.h"
 #include "mkldnn_reorder_node.h"
 #include "mkldnn_input_node.h"
-#include "mkldnn_activation_node.h"
 #include "desc_iterator.hpp"
 #include "mkldnn_eltwise_node.h"
-#include "mkldnn_depthwise_node.h"
 #include "mkldnn_quantize_node.h"
 #include "mkldnn_pooling_node.h"
 #include "mkldnn_concat_node.h"
@@ -110,6 +108,21 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
     if (convLayer == nullptr)
         THROW_IE_EXCEPTION << "Cannot convert convolution layer.";
 
+    withSum = false;
+    int expectedInputEdgesNum = baseInputsNumber;
+    for (int i = 0; i < fusedWith.size(); i++) {
+        auto *convolutionNode = dynamic_cast<MKLDNNConvolutionNode *>(fusedWith[i].get());
+        if (convolutionNode) {
+            expectedInputEdgesNum += convolutionNode->getBaseIntputsNumber() - 1;
+        }
+
+        auto *eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
+        if (eltwiseNode && eltwiseNode->isSum()) {
+            withSum = true;
+            expectedInputEdgesNum++;
+        }
+    }
+
     auto inputDataType = precisionToDataType(getCnnLayer()->insData[0].lock()->getPrecision());
     if (!inputZeroPoints.empty())
         inputDataType = memory::u8;
@@ -127,10 +140,10 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
 
     // We need to make sure that convolution output and second input of fused Eltwise operation
     // have equal precision sizes since they use the same physical memory. In case precisions are different we upscale to FP32.
-    if (outputDataType != memory::f32 && outputDataType != memory::bf16 && isFusedWith(Eltwise)) {
+    if (outputDataType != memory::f32 && outputDataType != memory::bf16 && withSum) {
         for (int i = 0; i < fusedWith.size(); i++) {
             auto *eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
-            if (eltwiseNode) {
+            if (eltwiseNode && eltwiseNode->isSum()) {
                 eltwisePrecision = fusedEltwisePrecision(eltwiseNode, i);
                 if (MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType).size() != eltwisePrecision.size()) {
                     eltwisePrecision = Precision::FP32;
@@ -142,14 +155,6 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
         }
     }
 
-    int expectedInputEdgesNum = baseInputsNumber + isFusedWith(Eltwise);
-    for (int i = 0; i < fusedWith.size(); i++) {
-        auto *convolutionNode = dynamic_cast<MKLDNNConvolutionNode *>(fusedWith[i].get());
-        if (convolutionNode) {
-            expectedInputEdgesNum += convolutionNode->getBaseIntputsNumber() - 1;
-        }
-    }
-
     if (getParentEdges().size() != expectedInputEdgesNum)
         THROW_IE_EXCEPTION << "Incorrect number of input edges for layer " << getName();
     if (getChildEdges().empty())
@@ -232,7 +237,6 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
 
     MKLDNNDims weightsDims = MKLDNNDims(weightDims);
 
-    withSum = isFusedWith(Eltwise);
     withDWConv = isFusedWith(Convolution);
 
     for (int i = 0; i < fusedWith.size(); i++) {
@@ -287,7 +291,7 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
             eltwisePrecision = Precision::FP32;
             for (int i = 0; i < fusedWith.size(); i++) {
                 auto *eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
-                if (eltwiseNode) {
+                if (eltwiseNode && eltwiseNode->isSum()) {
                     eltwisePrecision = fusedEltwisePrecision(eltwiseNode, i);
                     // TODO(amalyshe): there might be situation when convolution can be executed in BF16,
                     // output is required in FP32 but eltwise inplace tensor would be in BF16
@@ -364,93 +368,16 @@ void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWe
         if (node->getType() == Split || node->getType() == Concatenation)
             continue;
 
-#if defined (COMPILED_CPU_MKLDNN_ELTWISE_NODE)
         auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
-        if (eltwiseNode) {
-            if (eltwiseNode->getCnnLayer()->precision == Precision::I8) {
-                auto it = eltwiseNode->getCnnLayer()->blobs.find("eltwise-sum-scale");
-                if (it != eltwiseNode->getCnnLayer()->blobs.end()) {
-                    // currently there is the only one scale while we need scale by channel :(
-                    ops.append_sum(it->second->buffer().as<float*>()[0], mkldnn::memory::convert_to_c(precisionToDataType(eltwisePrecision)));
-                }
-            } else {
+        if (eltwiseNode && eltwiseNode->isSum()) {
                 ops.append_sum(1.0, mkldnn::memory::convert_to_c(precisionToDataType(eltwisePrecision)));
-            }
-
             continue;
         }
-#endif
-
-#if defined(COMPILED_CPU_MKLDNN_ACTIVATION_NODE)
-        auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get());
-        if (activationNode) {
-            ops.append_eltwise(1.0, activationNode->getAlgorithm(), activationNode->getAlpha(),
-                               activationNode->getBeta());
-            continue;
-        }
-#endif
-
-#if defined (COMPILED_CPU_MKLDNN_DEPTHWISE_NODE)
-        auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode *>(node.get());
-        if (depthwiseNode) {
-            auto* depthwiseLayer = reinterpret_cast<WeightableLayer*>(depthwiseNode->getCnnLayer().get());
-
-            if (initWeights) {
-                MKLDNNDims depthwiseDims({static_cast<ptrdiff_t>(rnd_up(biasesDims[0], 16))});
-
-                PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
-                PostOpsIntBlobMemory[blob_idx]->Create(depthwiseDims, memory::data_type::f32, memory::format::x);
-                PostOpsIntBlobMemory[blob_idx]->FillZero();
-                PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::x,
-
depthwiseLayer->_weights->buffer(), - depthwiseLayer->_weights->size() * - MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32)); - - if (depthwiseNode->isBroadcast()) { - float broadcastValue = static_cast(PostOpsIntBlobMemory[blob_idx]->GetData())[0]; - for (int i = 1; i < PostOpsIntBlobMemory[blob_idx]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) { - static_cast(PostOpsIntBlobMemory[blob_idx]->GetData())[i] = broadcastValue; - } - } - - if (depthwiseNode->getAlgorithm() == depthwise_scale_shift) { - PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine()))); - PostOpsIntBlobMemory[blob_idx + 1]->Create(depthwiseDims, memory::data_type::f32, - memory::format::x); - PostOpsIntBlobMemory[blob_idx + 1]->FillZero(); - PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::x, - depthwiseLayer->_biases->buffer(), - depthwiseLayer->_biases->size() * - MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32)); - - if (depthwiseNode->isBroadcast()) { - float broadcastValue = static_cast(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[0]; - for (int i = 1; i < PostOpsIntBlobMemory[blob_idx + 1]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) { - static_cast(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[i] = broadcastValue; - } - } - - ops.append_depthwise(depthwiseNode->getAlgorithm(), - (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(), - (const float *) PostOpsIntBlobMemory[blob_idx + 1]->GetData()); - - blob_idx += 2; - } else { - ops.append_depthwise(depthwiseNode->getAlgorithm(), - (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(), - nullptr); - - blob_idx += 1; - } - } else { - ops.append_depthwise(depthwiseNode->getAlgorithm(), - nullptr, - nullptr); - } + if (eltwiseNode) { + eltwiseNode->appendPostOps(ops); continue; } -#endif auto* quantizeNode = dynamic_cast(node.get()); if (quantizeNode) { diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_def_conv_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_def_conv_node.cpp index 936df9a..f09611b 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_def_conv_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_def_conv_node.cpp @@ -5,10 +5,8 @@ #include "mkldnn_def_conv_node.h" #include "mkldnn_reorder_node.h" #include "mkldnn_input_node.h" -#include "mkldnn_activation_node.h" #include "desc_iterator.hpp" #include "mkldnn_eltwise_node.h" -#include "mkldnn_depthwise_node.h" #include #include #include diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp deleted file mode 100644 index 486cc96..0000000 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp +++ /dev/null @@ -1,353 +0,0 @@ -// Copyright (C) 2018-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "mkldnn_depthwise_node.h" -#include "desc_iterator.hpp" -#include -#include -#include -#include -#include -#include "caseless.hpp" - -using namespace mkldnn; -using namespace MKLDNNPlugin; -using namespace InferenceEngine; -using namespace InferenceEngine::details; - -MKLDNNDepthwiseNode::MKLDNNDepthwiseNode(InferenceEngine::CNNLayerPtr layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache) - : MKLDNNNode(layer, eng, cache) { - internalBlobDesc.emplace_back([&](primitive_desc_iterator &primitive_desc_it, size_t idx) -> MKLDNNMemoryDesc { - return MKLDNNMemoryDesc(primitive_desc_it.weights_primitive_desc(0).desc()); - }); - 
internalBlobDesc.emplace_back([&](primitive_desc_iterator &primitive_desc_it, size_t idx) -> MKLDNNMemoryDesc { - if (!isWithBiases()) - return MKLDNNMemoryDesc(); - return MKLDNNMemoryDesc(primitive_desc_it.weights_primitive_desc(1).desc()); - }); -} - -void MKLDNNDepthwiseNode::getSupportedDescriptors() { - InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision(); - if (precision != InferenceEngine::Precision::FP32) - precision = InferenceEngine::Precision::FP32; - auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision); - - auto parentOutDims = getParentEdgeAt(0)->getDims(); - - if (getParentEdges().size() != 1) - THROW_IE_EXCEPTION << "Cannot create layer " << getName() << ": Incorrect number of inputs!"; - if (parentOutDims != getChildEdgeAt(0)->getDims()) - THROW_IE_EXCEPTION << "Cannot create layer " << getName() << ": Incorrect dimensions!"; - - auto size = static_cast(parentOutDims.ndims() == 1 ? parentOutDims[0] : parentOutDims[1]); - SizeVector weightDims = { size }; - MKLDNNDims blocked_weightDims(weightDims); - - auto * wLayer = dynamic_cast(getCnnLayer().get()); - if (wLayer == nullptr) - THROW_IE_EXCEPTION << "Cannot get weightable layer for node " << getName() << "."; - - InferenceEngine::Blob::Ptr blb = wLayer->_weights; - if (blb) - realWeightSize = blb->size(); - internalBlobs.push_back(createInternalBlob(weightDims, true)); - if (isWithBiases()) { - InferenceEngine::Blob::Ptr blb = wLayer->_biases; - if (blb) - realBiasSize = blb->size(); - internalBlobs.push_back(createInternalBlob(weightDims, false)); - } - - for (auto format : getAvailableFormatsForDims(parentOutDims)) { - MKLDNNMemoryDesc in_candidate{parentOutDims, inputDataType, format}; - createDescriptor({in_candidate}, {}); - } -} - -void MKLDNNDepthwiseNode::initSupportedPrimitiveDescriptors() { - if (!supportedPrimitiveDescriptors.empty()) - return; - - auto parentOutDims = getParentEdgeAt(0)->getDims(); - if (parentOutDims.ndims() <= 5) { - MKLDNNNode::initSupportedPrimitiveDescriptors(); - } else { - createSpecificDescriptor5D(); - if (specificDesc5DPtr == nullptr) - THROW_IE_EXCEPTION << "Cannot create specific MKLDNNDescriptor for depthwise node " << getName(); - const auto& desc = *specificDesc5DPtr; - auto itpd = desc.createPrimitiveDescriptorIterator(getEngine()); - while (itpd.is_not_end()) { - InferenceEngine::LayerConfig config; - config.dynBatchSupport = true; - for (size_t i = 0; i < descInputNumbers(desc); i++) { - InferenceEngine::DataConfig dataConfig; - dataConfig.inPlace = -1; - dataConfig.constant = false; - dataConfig.desc = MKLDNNMemoryDesc(InferenceEngine::TensorDesc(Precision::FP32, parentOutDims.ToSizeVector(), Layout::ANY)); - config.inConfs.push_back(dataConfig); - } - - std::vector outFormats; - for (size_t i = 0; i < descOutputNumbers(desc); i++) { - InferenceEngine::DataConfig dataConfig; - dataConfig.inPlace = canBeInPlace() ? 
0 : -1; - dataConfig.constant = false; - dataConfig.desc = MKLDNNMemoryDesc(InferenceEngine::TensorDesc(Precision::FP32, parentOutDims.ToSizeVector(), Layout::ANY)); - config.outConfs.push_back(dataConfig); - - auto primDesc = itpd.fetch(); - auto dstPrimDesc = mkldnn_primitive_desc_query_pd(primDesc.get(), mkldnn::convert_to_c(dst_pd), 0); - if (dstPrimDesc) { - outFormats.emplace_back(static_cast(itpd.dst_primitive_desc().desc().data.format)); - } else { - // This path is needed to correctly handle Deconvolution node - auto diffSrcPrimDesc = mkldnn_primitive_desc_query_pd(primDesc.get(), mkldnn::convert_to_c(diff_src_pd), 0); - if (diffSrcPrimDesc) { - outFormats.emplace_back(static_cast(itpd.diff_src_primitive_desc().desc().data.format)); - } - } - } - impl_desc_type impl_type = parse_impl_name(itpd.get_impl_info_str()); - - supportedPrimitiveDescriptors.emplace_back(config, impl_type, outFormats); - itpd++; - } - } -} - -void MKLDNNDepthwiseNode::createPrimitive() { - if (prim) - return; - - auto& dstMemPtr = getChildEdgeAt(0)->getMemoryPtr(); - auto& srcMemPtr = getParentEdgeAt(0)->getMemoryPtr(); - if (!dstMemPtr || !dstMemPtr->GetPrimitivePtr()) - THROW_IE_EXCEPTION << "Destination memory didn't allocate."; - if (!srcMemPtr || !srcMemPtr->GetPrimitivePtr()) - THROW_IE_EXCEPTION << "Input memory didn't allocate."; - if (getSelectedPrimitiveDescriptor() == nullptr) - THROW_IE_EXCEPTION << "Preferable primitive descriptor is not set."; - - auto createRightPrimitiveDescriptor = [&]() -> depthwise_forward::primitive_desc { - auto parentOutDims = getParentEdgeAt(0)->getDims(); - if (parentOutDims.ndims() <= 5) { - return createPrimitiveDescriptor(); - } else { - const PrimitiveDescInfo *selected_pd = getSelectedPrimitiveDescriptor(); - auto& desc = *specificDesc5DPtr; - auto itpd = desc.createPrimitiveDescriptorIterator(getEngine(), mkldnn::primitive_attr()); - - while (itpd.is_not_end()) { - impl_desc_type impl_type = parse_impl_name(itpd.get_impl_info_str()); - if (impl_type == getSelectedPrimitiveDescriptor()->getImplementationType()) { - specificPrepareMemory5D(itpd); - std::shared_ptr selected_desc_ptr = desc; - depthwise_forward::primitive_desc prim_desc = depthwise_forward::primitive_desc(*selected_desc_ptr, getEngine()); - return prim_desc; - } - itpd++; - } - THROW_IE_EXCEPTION << "Cannot create specific primitive descriptor for depthwise node " << getName() << "."; - } - }; - - auto prim_desc = createRightPrimitiveDescriptor(); - - if (isBroadcast()) { - float broadcastValue = static_cast(internalBlobMemory[0]->GetData())[0]; - size_t blbSize = internalBlobMemory[0]->GetPrimitiveDescriptor().desc().data.dims[0]; - for (int i = 1; i < blbSize && realWeightSize != blbSize; i++) { - static_cast(internalBlobMemory[0]->GetData())[i] = broadcastValue; - } - - if (isWithBiases()) { - blbSize = internalBlobMemory[1]->GetPrimitiveDescriptor().desc().data.dims[0]; - broadcastValue = static_cast(internalBlobMemory[1]->GetData())[0]; - for (int i = 1; i < blbSize && realBiasSize != blbSize; i++) { - static_cast(internalBlobMemory[1]->GetData())[i] = broadcastValue; - } - } - } else { - size_t blbSize = internalBlobMemory[0]->GetPrimitiveDescriptor().desc().data.dims[0]; - if (realWeightSize != blbSize) - THROW_IE_EXCEPTION << "Cannot create layer " << getName() << ": Incorrect weights!"; - if (isWithBiases()) { - blbSize = internalBlobMemory[1]->GetPrimitiveDescriptor().desc().data.dims[0]; - if (realBiasSize != blbSize) - THROW_IE_EXCEPTION << "Cannot create layer " << getName() << ": 
Incorrect biases!"; - } - } - - if (isWithBiases()) { - prim.reset(new depthwise_forward(prim_desc, getParentEdgeAt(0)->getMemory().GetPrimitive(), - internalBlobMemory[0]->GetPrimitive(), - internalBlobMemory[1]->GetPrimitive(), - getChildEdgeAt(0)->getMemory().GetPrimitive())); - } else { - prim.reset(new depthwise_forward(prim_desc, getParentEdgeAt(0)->getMemory().GetPrimitive(), - internalBlobMemory[0]->GetPrimitive(), - getChildEdgeAt(0)->getMemory().GetPrimitive())); - } -} - -bool MKLDNNDepthwiseNode::created() const { - return getType() == Depthwise; -} - -void MKLDNNDepthwiseNode::init() { - GenericLayer* depthwiseLayer = getCnnLayer().get(); - if (depthwiseLayer == nullptr) - THROW_IE_EXCEPTION << "Cannot get CNNLayer."; - - CaselessEq comparator; - if (comparator(depthwiseLayer->type, "ScaleShift")) { - auto *scshLayer = dynamic_cast(getCnnLayer().get()); - if (scshLayer == nullptr) - THROW_IE_EXCEPTION << "Cannot get scale shift layer " << getName(); - if (scshLayer->_weights == nullptr) - THROW_IE_EXCEPTION << "ScaleShift without weights is not supported"; - - algorithm = depthwise_scale_shift; - withBiases = scshLayer->_biases != nullptr; - broadcast = static_cast(scshLayer->_broadcast); - } else if (comparator(depthwiseLayer->type, "PReLU")) { - auto *preluLayer = dynamic_cast(getCnnLayer().get()); - if (preluLayer == nullptr) - THROW_IE_EXCEPTION << "Cannot get PReLU layer " << getName(); - if (preluLayer->_weights == nullptr) - THROW_IE_EXCEPTION << "PReLU without weights is not supported"; - - algorithm = depthwise_prelu; - withBiases = false; - broadcast = preluLayer->_channel_shared; - } else { - THROW_IE_EXCEPTION << "Unsupported depthwise operation"; - } -} - -void MKLDNNDepthwiseNode::createDescriptor(const std::vector &inputDesc, - const std::vector &outputDesc) { - MKLDNNMemoryDesc in_candidate(inputDesc[0]); - MKLDNNMemoryDesc out_candidate(inputDesc[0]); - MKLDNNDims weightDims({in_candidate.getDims().ndims() == 1 ? 
in_candidate.getDims()[0] : in_candidate.getDims()[1]}); - - MKLDNNMemoryDesc wgh_candidate{weightDims, in_candidate.getDataType(), memory::x}; - - if (isWithBiases()) { - MKLDNNMemoryDesc bias_candidate{weightDims, in_candidate.getDataType(), memory::x}; - MKLDNNDescriptor desc(std::shared_ptr( - new depthwise_forward::desc(prop_kind::forward_scoring, getAlgorithm(), in_candidate, out_candidate, wgh_candidate, bias_candidate))); - descs.push_back(desc); - } else { - MKLDNNDescriptor desc(std::shared_ptr( - new depthwise_forward::desc(prop_kind::forward_scoring, getAlgorithm(), in_candidate, out_candidate, wgh_candidate))); - descs.push_back(desc); - } -} - -void MKLDNNDepthwiseNode::initOptimalPrimitiveDescriptor() { - auto selected_pd = getSelectedPrimitiveDescriptor(); - if (selected_pd == nullptr) - THROW_IE_EXCEPTION << "Preferable primitive descriptor is not set."; - auto config = selected_pd->getConfig(); - if (isInitConfig(config)) - return; - - if (config.inConfs.size() != 1 || config.outConfs.size() != 1 || (!isUninitTensorDesc(config.inConfs[0].desc) && - !isUninitTensorDesc(config.outConfs[0].desc) && config.inConfs[0].desc != config.outConfs[0].desc)) - THROW_IE_EXCEPTION << "Layer " << getName() << " has incorrect selected config!"; - - if (getParentEdgeAt(0)->getDims().ndims() > 5) - return; - - if (!isUninitTensorDesc(config.inConfs[0].desc)) { - config.outConfs[0].desc = config.inConfs[0].desc; - } else if (!isUninitTensorDesc(config.outConfs[0].desc)) { - config.inConfs[0].desc = config.outConfs[0].desc; - } else { - config.outConfs[0].desc = config.inConfs[0].desc = getConfiguredInputDesc(config, 0); - } - - initDescriptor(config); -} - -void MKLDNNDepthwiseNode::createSpecificDescriptor5D() { - auto parentOutDims = getParentEdgeAt(0)->getDims(); - MKLDNNDims newDims; - for (int i = 0; i < 4; i++) - newDims.push_back(parentOutDims[i]); - int lastDim = 1; - for (int i = 4; i < parentOutDims.ndims(); i++) { - lastDim *= parentOutDims[i]; - } - newDims.push_back(lastDim); - - MKLDNNMemoryDesc in_candidate{newDims, MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::Precision::FP32), mkldnn::memory::ncdhw}; - MKLDNNMemoryDesc out_candidate(in_candidate); - MKLDNNDims weightDims({in_candidate.getDims()[1]}); - - MKLDNNMemoryDesc wgh_candidate{weightDims, in_candidate.getDataType(), memory::x}; - - if (isWithBiases()) { - MKLDNNMemoryDesc bias_candidate{weightDims, in_candidate.getDataType(), memory::x}; - MKLDNNDescriptor desc(std::shared_ptr( - new depthwise_forward::desc(prop_kind::forward_scoring, getAlgorithm(), in_candidate, out_candidate, wgh_candidate, bias_candidate))); - specificDesc5DPtr = std::make_shared(desc); - } else { - MKLDNNDescriptor desc(std::shared_ptr( - new depthwise_forward::desc(prop_kind::forward_scoring, getAlgorithm(), in_candidate, out_candidate, wgh_candidate))); - specificDesc5DPtr = std::make_shared(desc); - } -} - -void MKLDNNDepthwiseNode::specificPrepareMemory5D(mkldnn::primitive_desc_iterator& itpd) { - std::vector intDescs; - for (auto &it : internalBlobDesc) - intDescs.push_back(it(itpd, 0)); - - internalBlobMemory.clear(); - for (size_t i = 0; i < internalBlobs.size(); i++) { - const auto &internalBlob = internalBlobs[i]; - - auto create = [&] () { - auto newDesc = MKLDNNMemoryDesc(internalBlob->getTensorDesc()); - auto newFormat = newDesc.getFormat(); - if (newFormat == mkldnn::memory::ncdhw) { - newFormat = mkldnn::memory::goihw; - } - if (newFormat == mkldnn::memory::nchw) { - newFormat = mkldnn::memory::oihw; - } - - 
MKLDNNMemory memory{ getEngine() }; - memory.Create(MKLDNNMemoryDesc(newDesc.getDims(), newDesc.getDataType(), newFormat), internalBlob->buffer()); - - MKLDNNMemoryPtr _ptr = MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())); - _ptr->Create(intDescs[i]); - _ptr->SetData(memory); - - return _ptr; - }; - - MKLDNNMemoryPtr ptr; - if (weightCache != nullptr) { - const uint64_t data_hash = weightCache->GetHashFunc().hash( - internalBlob->buffer(), internalBlob->byteSize()); - - const std::string string_hash = getName() + "_" + std::to_string(i) - + "_" + std::to_string(internalBlob->byteSize()) - + "_" + std::to_string(data_hash); - - ptr = weightCache->findOrCreate(string_hash, create); - } else { - ptr = create(); - } - - internalBlobMemory.push_back(ptr); - } -} - -REG_MKLDNN_PRIM_FOR(MKLDNNDepthwiseNode, Depthwise); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.h deleted file mode 100644 index 01f9648..0000000 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.h +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (C) 2018-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include -#include -#include -#include -#include - -namespace MKLDNNPlugin { - -class MKLDNNDepthwiseNode : public MKLDNNNode { -public: - MKLDNNDepthwiseNode(InferenceEngine::CNNLayerPtr layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); - ~MKLDNNDepthwiseNode() override = default; - - void createDescriptor(const std::vector& inputDesc, - const std::vector& outputDesc) override; - void initOptimalPrimitiveDescriptor() override; - void getSupportedDescriptors() override; - void initSupportedPrimitiveDescriptors() override; - void createPrimitive() override; - bool created() const override; - - mkldnn::algorithm getAlgorithm() const { return algorithm; } - bool isWithBiases() const { return withBiases; } - bool isBroadcast() const { return broadcast; } - -private: - void init() override; - - mkldnn::algorithm algorithm = mkldnn::algorithm::depthwise_scale_shift; - size_t realWeightSize = 0; - size_t realBiasSize = 0; - bool withBiases = false; - bool broadcast = false; - - std::shared_ptr specificDesc5DPtr; - void createSpecificDescriptor5D(); - void specificPrepareMemory5D(mkldnn::primitive_desc_iterator& itpd); -}; - -} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp index 7f36301..2c0fc6f 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp @@ -13,46 +13,84 @@ #include #include "ie_parallel.hpp" #include "mkldnn_quantize_node.h" -#include "mkldnn_activation_node.h" #include #include "jit_uni_eltwise.hpp" #include "jit_uni_quantization.hpp" +#include "common/emitter.h" +#include "jit_eltwise_emitters.hpp" +#include "jit_mkldnn_emitters.hpp" +#include "ref_eltwise.hpp" +#include "mkldnn_pooling_node.h" -using namespace mkldnn; using namespace MKLDNNPlugin; using namespace InferenceEngine; -using namespace mkldnn::impl; -using namespace mkldnn::impl::cpu; using namespace mkldnn::impl::utils; + +using namespace mkldnn::impl::cpu; using namespace Xbyak; -#define GET_OFF(field) offsetof(jit_eltwise_fq_call_args, field) +#define GET_OFF(field) offsetof(jit_eltwise_call_args, field) template -struct jit_uni_eltwise_fq_generic : public 
jit_uni_eltwise_fq_kernel, public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_eltwise_fq_generic) - - explicit jit_uni_eltwise_fq_generic(jit_eltwise_fq_params jep, const mkldnn_primitive_attr &attr) : jit_uni_eltwise_fq_kernel(jep, attr), jit_generator() { - const auto &p = attr_.post_ops_; - for (int i = 0; i < p.len_; i++) { - auto &post_op = p.entry_[i]; - if (post_op.is_eltwise()) { - eltwise_injectors.push_back(std::make_shared>( - this, post_op.eltwise.alg, post_op.eltwise.alpha, post_op.eltwise.beta)); - } else if (post_op.is_quantization()) { +struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_generator { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_eltwise_generic) + + explicit jit_uni_eltwise_generic(jit_eltwise_params jep, MKLDNNEltwiseNode& eltwiseNode) : jit_uni_eltwise_kernel(jep, eltwiseNode), jit_generator() { + Precision exec_prc = Precision::UNSPECIFIED; + + std::set supported_precision_intersection = get_supported_precisions(eltwiseNode); + for (int i = 0; i < eltwiseNode.getFusedWith().size(); i++) { + if (eltwiseNode.getFusedWith()[i].get()->getType() == Eltwise) { + std::set prcs = get_supported_precisions(*eltwiseNode.getFusedWith()[i].get()); + + std::set_intersection(supported_precision_intersection.begin(), supported_precision_intersection.end(), + prcs.begin(), prcs.end(), std::inserter(supported_precision_intersection, supported_precision_intersection.begin())); + } + } + + for (auto prc : exec_precisions_priority) { + if (std::find(supported_precision_intersection.begin(), supported_precision_intersection.end(), prc) != supported_precision_intersection.end()) { + exec_prc = prc; + break; + } + } + + for (int i = 0; i < jep_.inputs_number; i++) { + if (jep_.src_prc[i] != exec_prc) { + exec_prc = Precision::FP32; + break; + } + } + + if (exec_prc == Precision::UNSPECIFIED) { + THROW_IE_EXCEPTION << "Eltwise jitter failed to specify execution precision for Eltwise node with name `" << eltwiseNode.getName() << "`"; + } + + eltwise_emitter = create_eltwise_emitter(eltwiseNode, exec_prc); + + mkldnn::post_ops post_ops; + for (int i = 0; i < eltwiseNode.getFusedWith().size(); i++) { + if (eltwiseNode.getFusedWith()[i].get()->getType() == Eltwise) { + post_op_emitters.push_back(create_eltwise_emitter(*eltwiseNode.getFusedWith()[i].get(), exec_prc)); + } else if (eltwiseNode.getFusedWith()[i].get()->getType() == Quantize) { + auto quantizeNode = dynamic_cast(eltwiseNode.getFusedWith()[i].get()); + quantizeNode->appendPostOps(post_ops); + quantization_injectors.push_back(std::make_shared>( - this, post_op, vmm_d_weights, vmm_d_bias, reg_d_weights, reg_d_bias)); + this, post_ops.get()->entry_[post_ops.get()->len_ - 1], vmm_d_weights, vmm_d_bias, reg_d_weights, reg_d_bias)); } } this->preamble(); - mov(reg_src0, ptr[reg_params + GET_OFF(src0)]); - mov(reg_src1, ptr[reg_params + GET_OFF(src1)]); + for (int i = 0; i < jep.inputs_number; i++) + mov(get_src_reg(i), ptr[reg_params + GET_OFF(src_ptr[0]) + i * sizeof(size_t)]); mov(reg_dst, ptr[reg_params + GET_OFF(dst)]); mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]); - xor_(reg_oc_off, reg_oc_off); + mov(reg_oc_off, ptr[reg_params + GET_OFF(oc_off)]); + Xbyak::Label unroll_loop_label; + Xbyak::Label unroll_loop_end_label; Xbyak::Label main_loop_label; Xbyak::Label main_loop_end_label; Xbyak::Label tail_loop_label; @@ -61,131 +99,145 @@ struct jit_uni_eltwise_fq_generic : public jit_uni_eltwise_fq_kernel, public jit if (isa == avx512_common) vpxord(vmm_zero, vmm_zero, vmm_zero); - 
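// For reference, the execution-precision resolution in the new constructor above reduces
// to the following standalone sketch (illustrative only -- Prc, resolveExecPrecision and
// perNodeSupported are assumed names, not patch symbols). The kernel additionally falls
// back to FP32 whenever some input precision differs from the chosen one. Note that
// std::set_intersection must write to a container distinct from its inputs: routed back
// into one of the input sets it can never remove elements, so the running set would
// never shrink.
#include <algorithm>
#include <iterator>
#include <set>
#include <stdexcept>
#include <vector>

enum class Prc { U8, I8, U16, I16, BF16, I32, FP32 };

Prc resolveExecPrecision(const std::vector<std::set<Prc>>& perNodeSupported,
                         const std::vector<Prc>& priority) {
    // Intersect the precisions supported by the root op and every fused eltwise op.
    std::set<Prc> common = perNodeSupported.at(0);
    for (size_t i = 1; i < perNodeSupported.size(); ++i) {
        std::set<Prc> tmp;
        std::set_intersection(common.begin(), common.end(),
                              perNodeSupported[i].begin(), perNodeSupported[i].end(),
                              std::inserter(tmp, tmp.begin()));
        common = std::move(tmp);  // the running set can only shrink
    }
    // The first precision in priority order that every op supports wins.
    for (Prc p : priority)
        if (common.count(p))
            return p;
    throw std::runtime_error("no common execution precision for eltwise fusion");
}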
if (jep.src0_step == 0) - uni_vbroadcastss(vmm_src0, ptr[reg_src0]); - if (jep.src1_step == 0) - uni_vbroadcastss(vmm_src1, ptr[reg_src1]); + for (int i = 0; i < jep.inputs_number; i++) { + if (jep.src_size[i] == 1) + load_vector(get_vmm_reg(i), ptr[get_src_reg(i)], jep.src_prc[i], exec_prc, true); + } - L(main_loop_label); - { - cmp(reg_work_amount, simd_w); - jl(main_loop_end_label, T_NEAR); - - if (jep.src0_step != 0) - load_vector(vmm_src0, ptr[reg_src0], jep.src0_dt); - if (jep.src1_step != 0) - load_vector(vmm_src1, ptr[reg_src1], jep.src1_dt); - - switch (jep.eltwise_op) { - case EltwiseLayer::eOperation::Sum: - if (isa == cpu::sse42) { - uni_vmovups(vmm_dst, vmm_src0); - uni_vaddps(vmm_dst, vmm_dst, vmm_src1); - } else { - uni_vaddps(vmm_dst, vmm_src0, vmm_src1); - } - break; - case EltwiseLayer::eOperation::Prod: - if (isa == cpu::sse42) { - uni_vmovups(vmm_dst, vmm_src0); - uni_vmulps(vmm_dst, vmm_dst, vmm_src1); - } else { - uni_vmulps(vmm_dst, vmm_src0, vmm_src1); - } - break; - default: THROW_IE_EXCEPTION << "Unsupported operation type for Eltwise node"; + size_t min_src_size = jep.dst_size; + for (int i = 0; i < jep.inputs_number; i++) { + if (jep.src_size[i] != 1) + min_src_size = std::min(min_src_size, jep.src_size[i]); + } + if (jep_.oc_size > 1) + min_src_size = std::min(min_src_size, jep_.oc_size); + + if (min_src_size != jep.dst_size) { + bool is_valid_configuration = true; + if (jep.dst_size % min_src_size != 0) + is_valid_configuration = false; + + for (int i = 0; i < jep.inputs_number; i++) { + if (jep.src_size[i] != 1 && jep.src_size[i] != min_src_size && jep.src_size[i] != jep.dst_size) + is_valid_configuration = false; } - int eltwise_inj_idx = 0; - int quantization_inj_idx = 0; - for (int i = 0; i < p.len_; i++) { - auto &post_op = p.entry_[i]; - if (post_op.is_eltwise()) { - eltwise_injectors[eltwise_inj_idx]->compute_vector_range(vmm_dst.getIdx(), vmm_dst.getIdx() + 1); - eltwise_inj_idx++; - } else if (post_op.is_quantization()) { - bool do_dequantization = post_op.quantization.alg == alg_kind::quantization_quantize_dequantize; - bool do_rounding = do_dequantization || jep_.dst_dt == data_type::f32 || i != p.len_ - 1; - int s_idx = vmm_dst.getIdx(); + if (jep_.oc_size > 1 && jep_.oc_size != min_src_size && jep_.oc_size != jep.dst_size) + is_valid_configuration = false; + + if (!is_valid_configuration) + THROW_IE_EXCEPTION << "Eltwise jitter has invalid configuration for Eltwise node with name `" << eltwiseNode.getName() << "`"; + + L(unroll_loop_label); + { + size_t loop_step = min_src_size; + size_t vec_step = cpu_isa_traits::vlen / exec_prc.size(); - quantization_injectors[quantization_inj_idx]->init_crop_ptrs(reg_oc_off); - quantization_injectors[quantization_inj_idx]->compute_crop(s_idx, s_idx + 1, 0); + cmp(reg_work_amount, loop_step); + jl(unroll_loop_end_label, T_NEAR); - quantization_injectors[quantization_inj_idx]->init_input_scale_shift_ptrs(reg_oc_off); - quantization_injectors[quantization_inj_idx]->compute_input_scale_shift(s_idx, s_idx + 1, 0, do_rounding); + for (int j = 0; j < min_src_size / vec_step; j++) { + for (int i = 0; i < jep.inputs_number; i++) { + if (jep.src_size[i] != 1) + load_vector(get_vmm_reg(i), ptr[get_src_reg(i) + j * vec_step * jep.src_prc[i].size()], jep.src_prc[i], exec_prc, false); + } + + compute_eltwise_op(); - quantization_injectors[quantization_inj_idx]->init_output_scale_shift_ptrs(reg_oc_off); - quantization_injectors[quantization_inj_idx]->compute_output_scale_shift(s_idx, s_idx + 1, 0); + apply_post_ops(false, 
jep_.oc_size > 1 ? j * vec_step * sizeof(float) : 0); - quantization_inj_idx++; + store_vector(ptr[reg_dst + j * vec_step * jep.dst_prc.size()], vmm_dst, exec_prc, jep.dst_prc); } - } - store_vector(ptr[reg_dst], vmm_dst, jep.dst_dt); + int tail_start = min_src_size - min_src_size % vec_step; + for (int j = tail_start; j < min_src_size; j++) { + for (int i = 0; i < jep.inputs_number; i++) { + if (jep.src_size[i] != 1) + load_scalar(get_xmm_reg(i), ptr[get_src_reg(i) + j * jep.src_prc[i].size()], jep.src_prc[i], exec_prc); + } - if (jep.src0_step != 0) - add(reg_src0, jep.src0_step * jep.src0_data_size * simd_w); - if (jep.src1_step != 0) - add(reg_src1, jep.src1_step * jep.src1_data_size * simd_w); - add(reg_dst, jep.dst_step * jep.dst_data_size * simd_w); - sub(reg_work_amount, simd_w); - add(reg_oc_off, simd_w * sizeof(float)); + compute_eltwise_op(); - jmp(main_loop_label, T_NEAR); - } + apply_post_ops(true, jep_.oc_size > 1 ? j * sizeof(float) : 0); - L(main_loop_end_label); + store_scalar(ptr[reg_dst + j * jep.dst_prc.size()], xmm_dst, exec_prc, jep.dst_prc); + } - L(tail_loop_label); - { - cmp(reg_work_amount, 1); - jl(tail_loop_end_label, T_NEAR); + for (int i = 0; i < jep.inputs_number; i++) + if (jep.src_size[i] == jep.dst_size) + add(get_src_reg(i), jep.src_prc[i].size() * loop_step); - if (jep.src0_step != 0) - load_scalar(xmm_src0, ptr[reg_src0], jep.src0_dt); - if (jep.src1_step != 0) - load_scalar(xmm_src1, ptr[reg_src1], jep.src1_dt); + add(reg_dst, jep.dst_prc.size() * loop_step); + sub(reg_work_amount, loop_step); + if (jep_.oc_size > 1 && jep_.oc_size != min_src_size) + add(reg_oc_off, loop_step * sizeof(float)); - switch (jep.eltwise_op) { - case EltwiseLayer::eOperation::Sum: uni_vaddps(vmm_dst, vmm_src0, vmm_src1); break; - case EltwiseLayer::eOperation::Prod: uni_vmulps(vmm_dst, vmm_src0, vmm_src1); break; - default: THROW_IE_EXCEPTION << "Unsupported operation type for Eltwise node"; + jmp(unroll_loop_label, T_NEAR); } - int eltwise_inj_idx = 0; - int quantization_inj_idx = 0; - for (int i = 0; i < p.len_; i++) { - auto &post_op = p.entry_[i]; - if (post_op.is_eltwise()) { - eltwise_injectors[eltwise_inj_idx]->compute_vector_range(vmm_dst.getIdx(), vmm_dst.getIdx() + 1); - eltwise_inj_idx++; - } else if (post_op.is_quantization()) { - bool do_dequantization = post_op.quantization.alg == alg_kind::quantization_quantize_dequantize; - bool do_rounding = do_dequantization || jep_.dst_dt == data_type::f32 || i != p.len_ - 1; - int s_idx = vmm_dst.getIdx(); - - quantization_injectors[quantization_inj_idx]->init_crop_ptrs(reg_oc_off); - quantization_injectors[quantization_inj_idx]->compute_crop(s_idx, s_idx + 1, 0, true); + L(unroll_loop_end_label); + } - quantization_injectors[quantization_inj_idx]->init_input_scale_shift_ptrs(reg_oc_off); - quantization_injectors[quantization_inj_idx]->compute_input_scale_shift(s_idx, s_idx + 1, 0, do_rounding, true); + if (min_src_size == jep.dst_size) { + L(main_loop_label); + { + size_t loop_step = cpu_isa_traits::vlen / exec_prc.size(); - quantization_injectors[quantization_inj_idx]->init_output_scale_shift_ptrs(reg_oc_off); - quantization_injectors[quantization_inj_idx]->compute_output_scale_shift(s_idx, s_idx + 1, 0, true); + cmp(reg_work_amount, loop_step); + jl(main_loop_end_label, T_NEAR); - quantization_inj_idx++; + for (int i = 0; i < jep.inputs_number; i++) { + if (jep.src_size[i] != 1) + load_vector(get_vmm_reg(i), ptr[get_src_reg(i)], jep.src_prc[i], exec_prc, false); } + + compute_eltwise_op(); + + apply_post_ops(false); + 
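// Rough host-side model (illustrative only, not patch code) of the control flow being
// emitted here: an unrolled loop that only exists when some operand (or the per-channel
// axis) repeats with a period shorter than the output row, a full-vector main loop, and
// a one-element tail.
#include <cstddef>

inline void eltwise_loop_model(size_t work_amount, size_t vec_step,
                               size_t min_src_size, size_t dst_size) {
    if (min_src_size != dst_size) {
        while (work_amount >= min_src_size) {        // unroll_loop
            // one fully unrolled period per iteration: min_src_size / vec_step vector
            // steps plus a scalar sub-tail of min_src_size % vec_step elements; operands
            // whose period is min_src_size are simply not advanced between iterations
            work_amount -= min_src_size;
        }
    } else {
        while (work_amount >= vec_step) {            // main_loop: full-width vectors
            work_amount -= vec_step;                 // load -> compute -> post ops -> store
        }
    }
    while (work_amount > 0) {                        // tail_loop, one element at a time
        work_amount -= 1;                            // via load_scalar / store_scalar
    }
}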
+ store_vector(ptr[reg_dst], vmm_dst, exec_prc, jep.dst_prc); + + for (int i = 0; i < jep.inputs_number; i++) + if (jep.src_size[i] != 1) + add(get_src_reg(i), jep.src_prc[i].size() * loop_step); + + add(reg_dst, jep.dst_prc.size() * loop_step); + sub(reg_work_amount, loop_step); + if (jep_.oc_size > 1) + add(reg_oc_off, loop_step * sizeof(float)); + + jmp(main_loop_label, T_NEAR); + } + + L(main_loop_end_label); + } + + L(tail_loop_label); + { + size_t loop_step = 1; + + cmp(reg_work_amount, loop_step); + jl(tail_loop_end_label, T_NEAR); + + for (int i = 0; i < jep.inputs_number; i++) { + if (jep.src_size[i] != 1) + load_scalar(get_xmm_reg(i), ptr[get_src_reg(i)], jep.src_prc[i], exec_prc); } - store_scalar(ptr[reg_dst], xmm_dst, jep.dst_dt); + compute_eltwise_op(); + + apply_post_ops(true); + + store_scalar(ptr[reg_dst], xmm_dst, exec_prc, jep.dst_prc); - if (jep.src0_step != 0) - add(reg_src0, jep.src0_step * jep.src0_data_size); - if (jep.src1_step != 0) - add(reg_src1, jep.src1_step * jep.src1_data_size); - add(reg_dst, jep.dst_step * jep.dst_data_size); - sub(reg_work_amount, 1); - add(reg_oc_off, 1 * sizeof(float)); + for (int i = 0; i < jep.inputs_number; i++) + if (jep.src_size[i] != 1) + add(get_src_reg(i), jep.src_prc[i].size() * loop_step); + + add(reg_dst, jep.dst_prc.size() * loop_step); + sub(reg_work_amount, loop_step); + if (jep_.oc_size > 1) + add(reg_oc_off, loop_step * sizeof(float)); jmp(tail_loop_label, T_NEAR); } @@ -194,8 +246,10 @@ struct jit_uni_eltwise_fq_generic : public jit_uni_eltwise_fq_kernel, public jit this->postamble(); - for (auto& inj : eltwise_injectors) - inj->prepare_table(); + eltwise_emitter->emit_table(); + for (int i = 0; i < post_op_emitters.size(); i++) { + post_op_emitters[i]->emit_table(); + } ker_ = (decltype(ker_)) this->getCode(); } @@ -203,95 +257,306 @@ struct jit_uni_eltwise_fq_generic : public jit_uni_eltwise_fq_kernel, public jit private: using Vmm = typename conditional3::type; - const int simd_w = cpu_isa_traits::vlen / sizeof(float); + Reg64 get_src_reg(int idx) { + return Reg64(r8.getIdx() + idx); + } + + Vmm get_vmm_reg(int idx) { + return Vmm(1 + idx); + } + + Vmm get_aux_vmm(int idx) { + return Vmm(10 + idx); + } + + Xmm get_xmm_reg(int idx) { + return Xmm(get_vmm_reg(idx).getIdx()); + } + + Reg64 reg_dst = rbx; + Reg64 reg_work_amount = rdx; - Reg64 reg_src0 = r8; - Reg64 reg_src1 = r9; - Reg64 reg_dst = r10; - Reg64 reg_work_amount = r11; - Reg64 reg_oc_off = r13; + Reg64 reg_oc_off = abi_not_param1; Reg64 reg_params = abi_param1; - Reg8 reg_tmp_8 = r12b; - Reg32 reg_tmp_32 = r12d; - Reg64 reg_tmp_64 = r12; + Reg8 reg_tmp_8 = Reg8(r15.getIdx()); + Reg32 reg_tmp_32 = Reg32(r15.getIdx()); + Reg64 reg_tmp_64 = Reg64(r15.getIdx()); - Reg64 reg_d_weights = r14; - Reg64 reg_d_bias = r15; + Reg64 reg_d_weights = rbp; + Reg64 reg_d_bias = rsi; - Vmm vmm_src0 = Vmm(0); - Vmm vmm_src1 = Vmm(1); - Vmm vmm_dst = Vmm(2); - Xmm xmm_src0 = Xmm(0); - Xmm xmm_src1 = Xmm(1); - Xmm xmm_dst = Xmm(2); + Vmm vmm_dst = Vmm(9); + Xmm xmm_dst = Xmm(9); - Vmm vmm_d_weights = Vmm(3); - Vmm vmm_d_bias = Vmm(4); + Vmm vmm_d_weights = Vmm(12); + Vmm vmm_d_bias = Vmm(13); + Vmm vmm_zero = Vmm(15); - Vmm vmm_zero = Vmm(5); + std::shared_ptr eltwise_emitter = nullptr; + std::vector> post_op_emitters = {}; - std::vector>> eltwise_injectors; - std::vector>> quantization_injectors; + std::vector>> quantization_injectors = {}; - inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, memory::data_type src_dt) { - switch (src_dt) { - case memory::f32: - 
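// The helpers replacing the old load_vector being deleted here (and load_scalar just
// below; their new bodies appear further down this hunk) widen every supported source
// precision to 32-bit lanes -- BF16 by shifting its 16 payload bits into the upper half
// of an FP32 pattern -- and emit the integer-to-float convert only when the execution
// precision is FP32. A scalar model of that contract (illustrative only; assumes FP32
// execution, names are not patch symbols):
#include <cstdint>
#include <cstring>

enum class SrcPrc { FP32, BF16, I32, I16, U16, I8, U8 };

inline float load_as_fp32(const void* p, SrcPrc src) {
    switch (src) {
        case SrcPrc::FP32: { float f; std::memcpy(&f, p, sizeof(f)); return f; }
        case SrcPrc::BF16: {  // BF16 occupies the upper 16 bits of an FP32 pattern
            uint32_t bits = uint32_t(*static_cast<const uint16_t*>(p)) << 16;
            float f; std::memcpy(&f, &bits, sizeof(f)); return f;
        }
        case SrcPrc::I32: return float(*static_cast<const int32_t*>(p));
        case SrcPrc::I16: return float(*static_cast<const int16_t*>(p));
        case SrcPrc::U16: return float(*static_cast<const uint16_t*>(p));
        case SrcPrc::I8:  return float(*static_cast<const int8_t*>(p));
        case SrcPrc::U8:  return float(*static_cast<const uint8_t*>(p));
    }
    return 0.0f;  // unreachable
}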
case memory::s32: - uni_vmovups(vmm_src, op); - break; - case memory::s8: - uni_vpmovsxbd(vmm_src, op); - break; - case memory::u8: - uni_vpmovzxbd(vmm_src, op); - break; - default: - assert(!"unknown dst_dt"); + std::vector exec_precisions_priority = { + Precision::U8, + Precision::I8, + Precision::U16, + Precision::I16, + Precision::BF16, + Precision::I32, + Precision::FP32 + }; + + std::set get_supported_precisions(MKLDNNNode& node) { + auto& eltwiseNode = dynamic_cast(node); + switch (eltwiseNode.getOpType()) { + case Relu: case Gelu: case Elu: case Tanh: case Logistic: case Square: case Abs: case Sqrt: + case Linear: case BoundedRelu: case SoftRelu: case Relu6: case Exp: case Clamp: case Swish: case Hswish: case Mish: case Hsigmoid: + return jit_mkldnn_emitter::get_supported_precisions(); + case Add: return jit_add_emitter::get_supported_precisions(); + case MulAdd: return jit_mul_add_emitter::get_supported_precisions(); + case Subtract: return jit_subtract_emitter::get_supported_precisions(); + case Multiply: return jit_multiply_emitter::get_supported_precisions(); + case Divide: return jit_divide_emitter::get_supported_precisions(); + case FloorMod: return jit_floor_mod_emitter::get_supported_precisions(); + case Mod: return jit_mod_emitter::get_supported_precisions(); + case Maximum: return jit_maximum_emitter::get_supported_precisions(); + case Minimum: return jit_minimum_emitter::get_supported_precisions(); + case SquaredDifference: return jit_squared_difference_emitter::get_supported_precisions(); + case PowerDynamic: return jit_power_dynamic_emitter::get_supported_precisions(); + case Equal: return jit_equal_emitter::get_supported_precisions(); + case NotEqual: return jit_not_equal_emitter::get_supported_precisions(); + case Greater: return jit_greater_emitter::get_supported_precisions(); + case GreaterEqual: return jit_greater_equal_emitter::get_supported_precisions(); + case Less: return jit_less_emitter::get_supported_precisions(); + case LessEqual: return jit_less_equal_emitter::get_supported_precisions(); + case LogicalAnd: return jit_logical_and_emitter::get_supported_precisions(); + case LogicalOr: return jit_logical_or_emitter::get_supported_precisions(); + case LogicalXor: return jit_logical_xor_emitter::get_supported_precisions(); + case LogicalNot: return jit_logical_not_emitter::get_supported_precisions(); + case PowerStatic: return jit_power_static_emitter::get_supported_precisions(); + case Prelu: return jit_prelu_emitter::get_supported_precisions(); + default: THROW_IE_EXCEPTION << "Unsupported operation type for Eltwise emitter"; + } + } + + std::shared_ptr create_eltwise_emitter(MKLDNNNode& node, Precision exec_prec) { + auto& eltwiseNode = dynamic_cast(node); + switch (eltwiseNode.getOpType()) { + case Relu: case Gelu: case Elu: case Tanh: case Logistic: case Square: case Abs: case Sqrt: + case Linear: case BoundedRelu: case SoftRelu: case Relu6: case Exp: case Clamp: case Swish: case Hswish: case Mish: case Hsigmoid: + return std::make_shared(this, isa, eltwiseNode, exec_prec); + case Add: return std::make_shared(this, isa, eltwiseNode, exec_prec); + case MulAdd: return std::make_shared(this, isa, eltwiseNode, exec_prec); + case Subtract: return std::make_shared(this, isa, eltwiseNode, exec_prec); + case Multiply: return std::make_shared(this, isa, eltwiseNode, exec_prec); + case Divide: return std::make_shared(this, isa, eltwiseNode, exec_prec); + case FloorMod: return std::make_shared(this, isa, eltwiseNode, exec_prec); + case Mod: return 
std::make_shared(this, isa, eltwiseNode, exec_prec); + case Maximum: return std::make_shared(this, isa, eltwiseNode, exec_prec); + case Minimum: return std::make_shared(this, isa, eltwiseNode, exec_prec); + case SquaredDifference: return std::make_shared(this, isa, eltwiseNode, exec_prec); + case PowerDynamic: return std::make_shared(this, isa, eltwiseNode, exec_prec); + case Equal: return std::make_shared(this, isa, eltwiseNode, exec_prec); + case NotEqual: return std::make_shared(this, isa, eltwiseNode, exec_prec); + case Greater: return std::make_shared(this, isa, eltwiseNode, exec_prec); + case GreaterEqual: return std::make_shared(this, isa, eltwiseNode, exec_prec); + case Less: return std::make_shared(this, isa, eltwiseNode, exec_prec); + case LessEqual: return std::make_shared(this, isa, eltwiseNode, exec_prec); + case LogicalAnd: return std::make_shared(this, isa, eltwiseNode, exec_prec); + case LogicalOr: return std::make_shared(this, isa, eltwiseNode, exec_prec); + case LogicalXor: return std::make_shared(this, isa, eltwiseNode, exec_prec); + case LogicalNot: return std::make_shared(this, isa, eltwiseNode, exec_prec); + case PowerStatic: return std::make_shared(this, isa, eltwiseNode, exec_prec); + case Prelu: return std::make_shared(this, isa, eltwiseNode, exec_prec); + default: THROW_IE_EXCEPTION << "Unsupported operation type for Eltwise emitter"; + } + } + + inline void compute_eltwise_op() { + std::vector in_idxs; + std::vector aux_idxs; + for (int i = 0; i < eltwise_emitter->get_inputs_num(); i++) + in_idxs.push_back(get_vmm_reg(i).getIdx()); + for (int i = 0; i < eltwise_emitter->aux_vecs_count(); i++) + aux_idxs.push_back(get_aux_vmm(i).getIdx()); + + std::vector out_idxs; + out_idxs.push_back(vmm_dst.getIdx()); + + eltwise_emitter->emit(in_idxs, out_idxs, aux_idxs); + } + + inline void apply_post_ops(bool is_scalar, int offset = 0) { + int input_idx = eltwise_emitter->get_inputs_num(); + int eltwise_post_op_idx = 0; + int quantization_post_op_idx = 0; + for (int i = 0; i < eltwiseNode.getFusedWith().size(); i++) { + if (eltwiseNode.getFusedWith()[i].get()->getType() == Eltwise) { + std::vector in_idxs; + std::vector aux_idxs; + in_idxs.push_back(vmm_dst.getIdx()); + for (int j = 1; j < post_op_emitters[eltwise_post_op_idx]->get_inputs_num(); j++) + in_idxs.push_back(get_vmm_reg(input_idx++).getIdx()); + for (int j = 0; j < post_op_emitters[eltwise_post_op_idx]->aux_vecs_count(); j++) + aux_idxs.push_back(get_aux_vmm(j).getIdx()); + + std::vector out_idxs; + out_idxs.push_back(vmm_dst.getIdx()); + + post_op_emitters[eltwise_post_op_idx]->emit(in_idxs, out_idxs, aux_idxs); + + eltwise_post_op_idx++; + } else { + auto quantizeNode = dynamic_cast(eltwiseNode.getFusedWith()[i].get()); + + bool do_dequantization = quantizeNode->getAlgorithm() == mkldnn::quantization_quantize_dequantize; + bool do_rounding = do_dequantization || jep_.dst_prc == Precision::FP32 || i != eltwiseNode.getFusedWith().size() - 1; + int s_idx = vmm_dst.getIdx(); + + quantization_injectors[quantization_post_op_idx]->init_crop_ptrs(reg_oc_off); + quantization_injectors[quantization_post_op_idx]->compute_crop(s_idx, s_idx + 1, offset, is_scalar, jep_.oc_size == 1); + + quantization_injectors[quantization_post_op_idx]->init_input_scale_shift_ptrs(reg_oc_off); + quantization_injectors[quantization_post_op_idx]->compute_input_scale_shift(s_idx, s_idx + 1, offset, do_rounding, + is_scalar, jep_.oc_size == 1); + + quantization_injectors[quantization_post_op_idx]->init_output_scale_shift_ptrs(reg_oc_off); + 
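// (Illustrative annotation, assumed reading -- not patch lines.) Each quantization
// injector stage is a two-step protocol: init_*_ptrs() resolves the per-channel data
// pointers relative to reg_oc_off, then the matching compute_*() applies them to the
// vector registers [s_idx, s_idx + 1). The crop, input-scale-shift and
// output-scale-shift stages chained here together implement the fused FakeQuantize
// transfer function; is_scalar selects the xmm tail flavour and jep_.oc_size == 1 the
// per-tensor broadcast fast path.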
+                quantization_injectors[quantization_post_op_idx]->compute_output_scale_shift(s_idx, s_idx + 1, offset, is_scalar, jep_.oc_size == 1);
+
+                quantization_post_op_idx++;
+            }
            }
+    }

+    inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, Precision src_prc, Precision dst_prc, bool broadcast) {
+        Xmm xmm_src = Xmm(vmm_src.getIdx());
+
+        if (broadcast) {
+            load_scalar(xmm_src, op, src_prc, dst_prc);
+            uni_vbroadcastss(vmm_src, xmm_src);
+        } else {
+            switch (src_prc) {
+                case Precision::FP32:
+                case Precision::I32:
+                    uni_vmovups(vmm_src, op);
+                    break;
+                case Precision::BF16:
+                    vpmovzxwd(vmm_src, op);
+                    uni_vpslld(vmm_src, vmm_src, 16);
+                    break;
+                case Precision::U16:
+                    uni_vpmovzxwd(vmm_src, op);
+                    break;
+                case Precision::I16:
+                    uni_vpmovsxwd(vmm_src, op);
+                    break;
+                case Precision::I8:
+                    uni_vpmovsxbd(vmm_src, op);
+                    break;
+                case Precision::U8:
+                    uni_vpmovzxbd(vmm_src, op);
+                    break;
+                default:
+                    assert(!"unknown src_prc");
+            }
-        if (src_dt != data_type::f32) {
-            uni_vcvtdq2ps(vmm_src, vmm_src);
+            switch (dst_prc) {
+                case Precision::FP32:
+                    if (src_prc != Precision::FP32 && src_prc != Precision::BF16)
+                        uni_vcvtdq2ps(vmm_src, vmm_src);
+                    break;
+                case Precision::I32:
+                    break;
+                default:
+                    assert(!"unknown dst_prc");
+            }
        }
    }

-    inline void load_scalar(Xmm xmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
-        switch (src_dt) {
-            case memory::f32:
-            case memory::s32:
+    inline void load_scalar(Xmm xmm_src, const Xbyak::Address &op, Precision src_prc, Precision dst_prc) {
+        switch (src_prc) {
+            case Precision::FP32:
+            case Precision::I32:
                movss(xmm_src, op);
                break;
-            case memory::s8:
+            case Precision::BF16:
+                uni_vpinsrw(xmm_src, xmm_src, op, 0);
+                uni_vpslld(xmm_src, xmm_src, 16);
+                break;
+            case Precision::I16:
+                uni_vpinsrw(xmm_src, xmm_src, op, 0);
+                uni_vpmovsxwd(xmm_src, op);
+                break;
+            case Precision::U16:
+                uni_vpinsrw(xmm_src, xmm_src, op, 0);
+                uni_vpmovzxwd(xmm_src, op);
+                break;
+            case Precision::I8:
                movsx(reg_tmp_32, op);
                movq(xmm_src, reg_tmp_64);
                break;
-            case memory::u8:
+            case Precision::U8:
                movzx(reg_tmp_32, op);
                movq(xmm_src, reg_tmp_64);
                break;
            default:
-                assert(!"unknown dst_dt");
+                assert(!"unknown src_prc");
        }
-        if (src_dt != data_type::f32) {
-            uni_vcvtdq2ps(xmm_src, xmm_src);
+        switch (dst_prc) {
+            case Precision::FP32:
+                if (src_prc != Precision::FP32 && src_prc != Precision::BF16)
+                    uni_vcvtdq2ps(xmm_src, xmm_src);
+                break;
+            case Precision::I32:
+                break;
+            default:
+                assert(!"unknown dst_prc");
+        }
    }

-    inline void store_vector(const Xbyak::Address &op, Vmm vmm_dst, memory::data_type dst_dt) {
+    inline void store_vector(const Xbyak::Address &op, Vmm vmm_dst, Precision src_prc, Precision dst_prc) {
        Xmm xmm_dst = Xmm(vmm_dst.getIdx());
        Ymm ymm_dst = Ymm(vmm_dst.getIdx());

-        if (dst_dt != data_type::f32) {
-            uni_vcvtps2dq(vmm_dst, vmm_dst);
+        switch (src_prc) {
+            case Precision::FP32:
+                if (dst_prc != Precision::FP32 && dst_prc != Precision::BF16)
+                    uni_vcvtps2dq(vmm_dst, vmm_dst);
+                break;
+            case Precision::I32:
+                break;
+            default:
+                assert(!"unknown src_prc");
        }

-        switch (dst_dt) {
-            case memory::f32:
-            case memory::s32:
+        switch (dst_prc) {
+            case Precision::FP32:
+            case Precision::I32:
                uni_vmovups(op, vmm_dst);
                break;
-            case memory::s8:
+            case Precision::BF16:
+                vcvtneps2bf16(ymm_dst, vmm_dst);
+                uni_vmovups(op, ymm_dst);
+                break;
+            case Precision::I16:
+                if (isa == avx512_common) {
+                    vpmovsdw(op, vmm_dst);
+                } else {
+                    uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst);
+                }
+                break;
+            case Precision::U16:
+                if (isa == avx512_common) {
+                    vmaxps(vmm_dst, vmm_zero, vmm_dst);
+                    vpmovusdw(op, vmm_dst);
+                } else {
+                    uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst);
+                }
+                break;
+            case Precision::I8:
                if (isa == avx512_common) {
                    vmaxps(vmm_dst, vmm_zero, vmm_dst);
                    vpmovsdb(op, vmm_dst);
@@ -306,7 +571,7 @@ private:
                    movd(op, xmm_dst);
                }
                break;
-            case memory::u8:
+            case Precision::U8:
                if (isa == avx512_common) {
                    vpmovusdb(op, vmm_dst);
                } else {
@@ -321,2377 +586,1069 @@ private:
                }
                break;
            default:
-                assert(!"unknown dst_dt");
+                assert(!"unknown dst_prc");
        }
    }

-    inline void store_scalar(const Xbyak::Address &op, Xmm xmm_dst, memory::data_type dst_dt) {
-        if (dst_dt != data_type::f32) {
-            uni_vcvtps2dq(xmm_dst, xmm_dst);
+    inline void store_scalar(const Xbyak::Address &op, Xmm xmm_dst, Precision src_prc, Precision dst_prc) {
+        switch (src_prc) {
+            case Precision::FP32:
+                if (dst_prc != Precision::FP32 && dst_prc != Precision::BF16)
+                    uni_vcvtps2dq(xmm_dst, xmm_dst);
+                break;
+            case Precision::I32:
+                break;
+            default:
+                assert(!"unknown src_prc");
        }

-        switch (dst_dt) {
-            case memory::f32:
-            case memory::s32:
+        switch (dst_prc) {
+            case Precision::FP32:
+            case Precision::I32:
                movss(op, xmm_dst);
                break;
-            case memory::s8:
+            case Precision::BF16:
+                uni_vpsrld(xmm_dst, xmm_dst, 16);
+                uni_vpextrw(op, xmm_dst, 0x0);
+                break;
+            case Precision::I16:
+                uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst);
+                movq(reg_tmp_64, xmm_dst);
+                mov(op, reg_tmp_8);
+                break;
+            case Precision::U16:
+                uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst);
+                movq(reg_tmp_64, xmm_dst);
+                mov(op, reg_tmp_8);
+                break;
+            case Precision::I8:
                uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst);
                uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
                movq(reg_tmp_64, xmm_dst);
                mov(op, reg_tmp_8);
                break;
-            case memory::u8:
+            case Precision::U8:
                uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst);
                uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
                movq(reg_tmp_64, xmm_dst);
                mov(op, reg_tmp_8);
                break;
            default:
-                assert(!"unknown dst_dt");
+                assert(!"unknown dst_prc");
        }
    }
};

 MKLDNNEltwiseNode::MKLDNNEltwiseNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache) :
-        MKLDNNNode(layer, eng, cache), eltiwse_fq_kernel(nullptr) {
-    op = EltwiseLayer::Sum;
+        MKLDNNNode(layer, eng, cache) {
 }

-bool MKLDNNEltwiseNode::isSum() {
-    auto * eltwiseLayer = dynamic_cast<EltwiseLayer*>(getCnnLayer().get());
-    if (eltwiseLayer == nullptr)
-        THROW_IE_EXCEPTION << "Cannot get eltwise layer " << getName();
-    return eltwiseLayer->_operation == EltwiseLayer::Sum;
-}
+InferenceEngine::details::caseless_map<std::string, std::function<void(GenericLayer*, EltwiseOpType&, mkldnn::algorithm&, float&, float&)>>
+MKLDNNEltwiseNode::initializers = {
+    {"relu", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+        alpha = activationLayer->GetParamAsFloat("negative_slope", 0.0f);
+        beta = 0.0f;
+        opType = Relu;
+        algorithm = mkldnn::eltwise_relu;
+    }},
+    {"gelu", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+        alpha = 0.0f;
+        beta = 0.0f;
+        opType = Gelu;
+        algorithm = mkldnn::eltwise_gelu;
+    }},
+    {"elu", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+        alpha = activationLayer->GetParamAsFloat("alpha", 1.0f);
+        beta = 0.0f;
+        opType = Elu;
+        algorithm = mkldnn::eltwise_elu;
+    }},
+    {"tanh", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+        alpha = 0.0f;
+        beta = 0.0f;
+        opType = Tanh;
+        algorithm = mkldnn::eltwise_tanh;
+    }},
+    {"sigmoid", [](GenericLayer* activationLayer, EltwiseOpType& 
opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) { + alpha = 0.0f; + beta = 0.0f; + opType = Logistic; + algorithm = mkldnn::eltwise_logistic; + }}, + {"logistic", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) { + alpha = 0.0f; + beta = 0.0f; + opType = Logistic; + algorithm = mkldnn::eltwise_logistic; + }}, + {"square", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) { + alpha = 0.0f; + beta = 0.0f; + opType = Square; + algorithm = mkldnn::eltwise_square; + }}, + {"abs", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) { + alpha = 0.0f; + beta = 0.0f; + opType = Abs; + algorithm = mkldnn::eltwise_abs; + }}, + {"sqrt", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) { + alpha = 0.0f; + beta = 0.0f; + opType = Sqrt; + algorithm = mkldnn::eltwise_sqrt; + }}, + {"linear", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) { + alpha = activationLayer->GetParamAsFloat("alpha", 1.0f); + beta = activationLayer->GetParamAsFloat("beta", 0.0f); + opType = Linear; + algorithm = mkldnn::eltwise_linear; + }}, + {"bounded_relu", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) { + alpha = activationLayer->GetParamAsFloat("alpha", 0.0f); + beta = 0.0f; + opType = BoundedRelu; + algorithm = mkldnn::eltwise_bounded_relu; + }}, + {"soft_relu", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) { + alpha = 0.0f; + beta = 0.0f; + opType = SoftRelu; + algorithm = mkldnn::eltwise_soft_relu; + }}, + {"relu6", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) { + alpha = activationLayer->GetParamAsFloat("n", 6.0f); + beta = 0.0f; + opType = Relu6; + algorithm = mkldnn::eltwise_bounded_relu; + }}, + {"clamp", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) { + alpha = activationLayer->GetParamAsFloat("max", 1.0f); + beta = activationLayer->GetParamAsFloat("min", 0.0f); + opType = Clamp; + algorithm = mkldnn::eltwise_clamp; + }}, + {"exp", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) { + alpha = 0.0f; + beta = 0.0f; + opType = Exp; + algorithm = mkldnn::eltwise_exp; + }}, + {"not", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) { + alpha = 0.0f; + beta = 0.0f; + opType = LogicalNot; + }}, + {"swish", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) { + alpha = activationLayer->GetParamAsFloat("alpha", 1.0f); + beta = 0.0f; + opType = Swish; + algorithm = mkldnn::eltwise_swish; + }}, + {"hswish", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) { + alpha = 0.0f; + beta = 0.0f; + opType = Hswish; + algorithm = mkldnn::eltwise_hswish; + }}, + {"mish", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) { + alpha = 0.0f; + beta = 0.0f; + opType = Mish; + algorithm = mkldnn::eltwise_mish; + }}, + 
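// (Illustrative annotation, assumed reading -- not patch lines.) Every entry in this
// initializers table has the same shape: read the layer's parameters (with defaults for
// alpha/beta), tag the node with an EltwiseOpType and, where a counterpart exists, the
// matching mkldnn eltwise algorithm so the op can also be emitted through the mkldnn
// injector path. "not" is the odd one out: it sets no mkldnn algorithm and is served
// purely by the jit_logical_not_emitter.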
{"hsigmoid", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) { + alpha = 0.0f; + beta = 0.0f; + opType = Hsigmoid; + algorithm = mkldnn::eltwise_hsigmoid; + }}, +}; -bool MKLDNNEltwiseNode::isUnitScales() { - auto * eltwiseLayer = dynamic_cast(getCnnLayer().get()); - if (eltwiseLayer == nullptr) - THROW_IE_EXCEPTION << "Cannot get eltwise layer " << getName(); +void MKLDNNEltwiseNode::init() { + InferenceEngine::details::CaselessEq comparator; + auto layerType = getCnnLayer().get()->type; - if (eltwiseLayer->coeff.empty()) - return true; + auto * eltwiseLayer = dynamic_cast(getCnnLayer().get()); + if (eltwiseLayer) { + if (!eltwiseLayer->coeff.empty()) + THROW_IE_EXCEPTION << "Eltwise node with name `" << getName() << "` doesn't support input coefficients."; + + switch (eltwiseLayer->_operation) { + case EltwiseLayer::Sum: eltwiseOp = Add; break; + case EltwiseLayer::Prod: eltwiseOp = Multiply; break; + case EltwiseLayer::Max: eltwiseOp = Maximum; break; + case EltwiseLayer::Sub: eltwiseOp = Subtract; break; + case EltwiseLayer::Min: eltwiseOp = Minimum; break; + case EltwiseLayer::Div: eltwiseOp = Divide; break; + case EltwiseLayer::Squared_diff: eltwiseOp = SquaredDifference; break; + case EltwiseLayer::Floor_mod: eltwiseOp = FloorMod; break; + case EltwiseLayer::Pow: eltwiseOp = PowerDynamic; break; + case EltwiseLayer::Equal: eltwiseOp = Equal; break; + case EltwiseLayer::Not_equal: eltwiseOp = NotEqual; break; + case EltwiseLayer::Greater: eltwiseOp = Greater; break; + case EltwiseLayer::Greater_equal: eltwiseOp = GreaterEqual; break; + case EltwiseLayer::Less: eltwiseOp = Less; break; + case EltwiseLayer::Less_equal: eltwiseOp = LessEqual; break; + case EltwiseLayer::Logical_AND: eltwiseOp = LogicalAnd; break; + case EltwiseLayer::Logical_OR: eltwiseOp = LogicalOr; break; + case EltwiseLayer::Logical_XOR: eltwiseOp = LogicalXor; break; + default: THROW_IE_EXCEPTION << "Unsupported algorithm for Eltwise node with name `" << getName() << "`."; + } + } else if (comparator(layerType, "mod")) { + eltwiseOp = Mod; + } else if (comparator(layerType, "power")) { + eltwiseOp = PowerStatic; + + auto *powerLayer = dynamic_cast(getCnnLayer().get()); + if (powerLayer == nullptr) + THROW_IE_EXCEPTION << "Cannot convert power layer."; + + alpha = powerLayer->power; + beta = powerLayer->scale; + gamma = powerLayer->offset; + } else if (comparator(layerType, "scaleshift")) { + if (getCnnLayer().get()->blobs.size() == 2) { + eltwiseOp = MulAdd; + eltwiseAlgorithm = mkldnn::depthwise_scale_shift; + } else { + eltwiseOp = Multiply; + } + } else if (comparator(layerType, "prelu")) { + eltwiseOp = Prelu; + eltwiseAlgorithm = mkldnn::depthwise_prelu; + } else if (comparator(layerType, "activation") && initializers.find(getCnnLayer().get()->GetParamAsString("type")) != initializers.end()) { + initializers[getCnnLayer().get()->GetParamAsString("type")](getCnnLayer().get(), eltwiseOp, eltwiseAlgorithm, alpha, beta); + } else if (comparator(layerType, "relu") || + comparator(layerType, "gelu") || + comparator(layerType, "elu") || + comparator(layerType, "sigmoid") || + comparator(layerType, "logistic") || + comparator(layerType, "tanh") || + comparator(layerType, "relu6") || + comparator(layerType, "exp") || + comparator(layerType, "not") || + comparator(layerType, "clamp") || + comparator(layerType, "swish") || + comparator(layerType, "hswish") || + comparator(layerType, "mish") || + comparator(layerType, "hsigmoid")) { + 
initializers[layerType](getCnnLayer().get(), eltwiseOp, eltwiseAlgorithm, alpha, beta); + } else { + THROW_IE_EXCEPTION << "Unsupported algorithm for Eltwise node with name `" << getName() << "`."; + } +} - for (auto scale : eltwiseLayer->coeff) { - if (scale != 1.0f) - return false; +size_t MKLDNNEltwiseNode::getOpInputsNum() const { + switch (getOpType()) { + case Relu: case Gelu: case Elu: case Tanh: case Logistic: case Square: case Abs: case Sqrt: case PowerStatic: + case Linear: case BoundedRelu: case SoftRelu: case Relu6: case Exp: case Clamp: case Swish: case Hswish: case Mish: case Hsigmoid: + case LogicalNot: + return 1; + case Add: case Subtract: case Multiply: case Divide: case FloorMod: case Mod: case Maximum: case Minimum: case SquaredDifference: + case PowerDynamic: case Equal: case NotEqual: case Greater: case GreaterEqual: case Less: case LessEqual: case LogicalAnd: + case LogicalOr: case LogicalXor: case Prelu: + return 2; + case MulAdd: + return 3; + default: THROW_IE_EXCEPTION << "Unsupported operation for Eltwise node with name `" << getName() << "`."; } +} - return true; +bool MKLDNNEltwiseNode::isSum() { + return eltwiseOp == Add; } bool MKLDNNEltwiseNode::isWithBroadcast() { - bool withBroadcast = false; auto oDims = outDims[0].ToSizeVector(); for (size_t i = 0; i < inDims.size(); i++) { auto iDims = inDims[i].ToSizeVector(); - for (size_t j = 1; j <= iDims.size(); j++) { - if (oDims[oDims.size() - j] != iDims[iDims.size() - j]) { - if (iDims[iDims.size() - j] == 1) { - withBroadcast = true; - } else { - THROW_IE_EXCEPTION << "Incorrect dimensions for broadcasting for " << getName(); - } - } - if (iDims.size() < oDims.size()) - withBroadcast = true; - } - if (iDims.size() == 0 && oDims.size()) - withBroadcast = true; + if (iDims != oDims) + return true; } - return withBroadcast; + return false; } void MKLDNNEltwiseNode::getSupportedDescriptors() { - auto * eltwiseLayer = dynamic_cast(getCnnLayer().get()); - - if (eltwiseLayer == nullptr) - THROW_IE_EXCEPTION << "Cannot convert eltwise layer."; - op = eltwiseLayer->_operation; - - if (getParentEdges().size() < 2) + if (getParentEdges().size() < 1) THROW_IE_EXCEPTION << "Incorrect number of input edges for layer " << getName(); if (getChildEdges().empty()) THROW_IE_EXCEPTION << "Incorrect number of output edges for layer " << getName(); - if (op == EltwiseLayer::Squared_diff) - if (getParentEdges().size() != 2) - THROW_IE_EXCEPTION << "Incorrect number of input edges for layer " << getName() << " for operation squared_diff.\n" - << "Expected: 2\n" << "Actual: " << getParentEdges().size(); +} - auto outDims = getChildEdgeAt(0)->getDims(); - for (size_t i = 0; i < getParentEdges().size(); i++) { - auto inDims = getParentEdgeAt(i)->getDims(); - batch_dim = std::min(batch_dim, 5 - inDims.ndims()); - } +void MKLDNNEltwiseNode::initSupportedPrimitiveDescriptors() { + std::vector supportedPrecisions = { + Precision::FP32, + Precision::U8, + Precision::I8, + Precision::U16, + Precision::I16, + Precision::BF16, + Precision::I32 + }; - broadcast = isWithBroadcast(); - if (broadcast) { - auto outDims = getChildEdgeAt(0)->getDims(); - for (size_t i = 0; i < getParentEdges().size(); i++) { - auto inDims = getParentEdgeAt(i)->getDims(); - if (inDims.ndims() > 5 || outDims.ndims() > 5) - THROW_IE_EXCEPTION << "Eltwise node in broadcasting mode doesn't support more than 5 dims for blobs"; + if (!supportedPrimitiveDescriptors.empty()) + return; + + canUseOptimizedImpl = mayiuse(cpu::sse42); + + size_t expectedInputsNum = 
getOpInputsNum();
+    for (auto& postOp : fusedWith) {
+        auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode*>(postOp.get());
+        if (eltwiseNode != nullptr) {
+            expectedInputsNum += eltwiseNode->getOpInputsNum() - 1;
+        }
    }
+    if (getParentEdges().size() > MAX_ELTWISE_INPUTS)
+        THROW_IE_EXCEPTION << "Eltwise node with name `" << getName() << "` doesn't support more than " << MAX_ELTWISE_INPUTS
+                           << " inputs (actual = " << getParentEdges().size() << ")";

-    bool with_coeffs = !eltwiseLayer->coeff.empty();
-    if (op != EltwiseLayer::Sum && with_coeffs)
-        THROW_IE_EXCEPTION << "Only sum operation supports operands coefficients";
+    if (expectedInputsNum != getParentEdges().size())
+        THROW_IE_EXCEPTION << "Eltwise node with name `" << getName() << "` has an invalid number of inputs: expected = " << expectedInputsNum
+                           << " (actual = " << getParentEdges().size() << ")";

-    if (with_coeffs && eltwiseLayer->coeff.size() != getParentEdges().size())
-        THROW_IE_EXCEPTION << "Number of provided coefficients is not equal to number of operands";
+    std::vector<Precision> inputPrecisions;
+    for (int i = 0; i < getCnnLayer()->insData.size(); i++) {
+        inputPrecisions.push_back(getCnnLayer()->insData[i].lock()->getPrecision());
+    }

-    if (with_coeffs && eltwiseLayer->precision != Precision::FP32)
-        THROW_IE_EXCEPTION << "Sum with coefficients supports only FP32 precision";
+    for (auto& fusedNode : fusedWith) {
+        if (fusedNode->getType() == Eltwise) {
+            for (int i = 1; i < fusedNode->getCnnLayer()->insData.size(); i++) {
+                inputPrecisions.push_back(fusedNode->getCnnLayer()->insData[i].lock()->getPrecision());
+            }
+        }
+    }

-    sum_scales.clear();
-    for (int i = 0; i < getParentEdges().size(); i++)
-        sum_scales.push_back(with_coeffs ? eltwiseLayer->coeff[i] : 1.0f);
-}
+    if (inputPrecisions.size() != getParentEdges().size())
+        THROW_IE_EXCEPTION << "Eltwise node with name `" << getName() << "` has invalid input precisions configuration.";

-void MKLDNNEltwiseNode::initSupportedPrimitiveDescriptors() {
-    if (!supportedPrimitiveDescriptors.empty())
-        return;
+    InferenceEngine::Precision outputPrecision = getCnnLayer()->outData[0]->getPrecision();
+    if (!fusedWith.empty()) {
+        auto lastFusedLayer = fusedWith[fusedWith.size() - 1].get()->getCnnLayer();
+        if (lastFusedLayer) {
+            outputPrecision = lastFusedLayer->outData[0]->getPrecision();
+        }
+    }

-    setPostOps(attr, true);
+    if (!mayiuse(avx512_core_bf16)) {
+        bool hasBF16 = false;
+        for (auto &inPrc : inputPrecisions)
+            if (inPrc == Precision::BF16)
+                hasBF16 = true;

-    auto initDesc = [&] (mkldnn::memory::data_type inputDT, mkldnn::memory::data_type outputDT, memory::format format) -> PrimitiveDescInfo {
-        InferenceEngine::LayerConfig config;
-        impl_desc_type impl_type = impl_desc_type::ref;
-        config.dynBatchSupport = true;
-        for (size_t i = 0; i < getParentEdges().size(); i++) {
-            InferenceEngine::DataConfig dataConfig;
-            dataConfig.inPlace = (!i && canBeInPlace()) ? 
0 : -1; - dataConfig.constant = false; + if (outputPrecision == Precision::BF16 || hasBF16) + THROW_IE_EXCEPTION << "Eltwise node with name `" << getName() << "` doesn't support BF16 precision on this target."; + } - if (!broadcast) { - dataConfig.desc = MKLDNNMemoryDesc(getParentEdgeAt(i)->getDims(), inputDT, format); - config.inConfs.push_back(dataConfig); + auto filterPrecision = [&](Precision& prc) { + if (!canUseOptimizedImpl) { + return Precision(Precision::FP32); + } else if (std::find(supportedPrecisions.begin(), supportedPrecisions.end(), prc) == supportedPrecisions.end()) { + if (prc == Precision::U32 || prc == Precision::I64 || prc == Precision::U64) { + return Precision(Precision::I32); } else { - // Broadcasting support - if (MKLDNNMemory::IsPlainFormat(format)) { - dataConfig.desc = MKLDNNMemoryDesc(getParentEdgeAt(i)->getDims(), inputDT, - MKLDNNMemory::GetPlainFormat(getParentEdgeAt(i)->getDims())); - config.inConfs.push_back(dataConfig); - } else { - // Unsupported format for broadcast mode. Should be skipped. - // Will mark it as undef and outer code should filter it. - impl_type = impl_desc_type::undef; - } + THROW_IE_EXCEPTION << "Eltwise node with name `" << getName() << "` doesn't support " << prc << " precision."; } + } else { + return prc; } - - InferenceEngine::DataConfig dataConfig; - dataConfig.inPlace = -1; - dataConfig.constant = false; - dataConfig.desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDT, format); - config.outConfs.push_back(dataConfig); - return {config, impl_type, format}; }; - if (fusedWith.empty()) { - for (const auto& format : getAvailableFormatsForDims(getChildEdgeAt(0)->getDims())) { - // Precision of implementation is defined by precision of output tensor - auto prec = getCnnLayer()->outData[0]->getPrecision(); - mkldnn::memory::data_type inputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(prec); - mkldnn::memory::data_type outputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(prec); - - // Eltwise compare operation can have the input type different from the output type - auto node_op = this->op; - bool is_eltwise_compare_node = ((node_op == EltwiseLayer::eOperation::Equal) || - (node_op == EltwiseLayer::eOperation::Not_equal) || - (node_op == EltwiseLayer::eOperation::Greater) || - (node_op == EltwiseLayer::eOperation::Greater_equal) || - (node_op == EltwiseLayer::eOperation::Less) || - (node_op == EltwiseLayer::eOperation::Less_equal)); - if (is_eltwise_compare_node) { - auto in_prec = getCnnLayer()->insData[0].lock()->getPrecision(); - inputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(in_prec); - } + for (int i = 0; i < inputPrecisions.size(); i++) { + inputPrecisions[i] = filterPrecision(inputPrecisions[i]); + } + outputPrecision = filterPrecision(outputPrecision); - if (inputDT == memory::bf16 || outputDT == memory::bf16) { - inputDT = memory::f32; - outputDT = memory::f32; - } + // TODO: delete after new LPT (ngraph based) is merged + // WA is needed to handle bug in LPT that produces wrong precision after average pooling (I8/U8 instead of FP32) + if (eltwiseOp == MulAdd && (inputPrecisions[0] == Precision::U8 || inputPrecisions[0] == Precision::I8)) { + auto poolingLayer = dynamic_cast(getParentEdgesAtPort(0)[0]->getParent()->getCnnLayer().get()); + if (poolingLayer && poolingLayer->_type == PoolingLayer::AVG) { + inputPrecisions[0] = Precision::FP32; + } + } + + enum LayoutType { + Planar, + ChannelsFirst, + Blocked + }; - auto impl_desc = initDesc(inputDT, outputDT, format); + auto initDesc = [&] (LayoutType lt) 
-> PrimitiveDescInfo { + auto createMemoryDesc = [lt](MKLDNNEdgePtr edge, Precision prc, size_t offset) -> TensorDesc { + if (lt == ChannelsFirst) { + std::vector blocks = edge->getDims().ToSizeVector(); + std::vector order; + order.push_back(0); + for (size_t j = 2; j < blocks.size(); j++) + order.push_back(j); + if (blocks.size() > 1) + order.push_back(1); + + return MKLDNNMemoryDesc(TensorDesc(prc, edge->getDims().ToSizeVector(), {blocks, order, offset})); + } else if (lt == Blocked && edge->getDims()[1] != 1) { + size_t blockSize = mayiuse(cpu::avx512_common) ? 16 : 8; + + std::vector blocks = edge->getDims().ToSizeVector(); + std::vector order(blocks.size()); + for (size_t j = 0; j < order.size(); j++) + order[j] = j; + + blocks[1] = div_up(blocks[1], blockSize); + blocks.push_back(blockSize); + order.push_back(1); + + return MKLDNNMemoryDesc(TensorDesc(prc, edge->getDims().ToSizeVector(), {blocks, order, offset})); + } else { + std::vector blocks = edge->getDims().ToSizeVector(); + std::vector order(blocks.size()); + for (size_t j = 0; j < order.size(); j++) + order[j] = j; - if (impl_desc.getImplementationType() != impl_desc_type::undef) { - supportedPrimitiveDescriptors.push_back(impl_desc); + return MKLDNNMemoryDesc(TensorDesc(prc, edge->getDims().ToSizeVector(), {blocks, order, offset})); } - } - } else { - auto ndims = getCnnLayer()->outData[0]->getDims().size(); - auto format = ndims == 2 ? memory::format::nc : - ndims == 4 ? memory::format::nhwc : - memory::format::ndhwc; + }; + size_t offset = std::numeric_limits::max(); InferenceEngine::LayerConfig config; - impl_desc_type impl_type = impl_desc_type::ref; - config.dynBatchSupport = true; + config.dynBatchSupport = getChildEdgeAt(0)->getDims().ndims() > 1 && getChildEdgeAt(0)->getDims() == getParentEdgeAt(0)->getDims(); + for (size_t i = 0; i < getParentEdges().size(); i++) { InferenceEngine::DataConfig dataConfig; - dataConfig.inPlace = -1; + dataConfig.inPlace = (!i && canBeInPlace() && inputPrecisions[i] == outputPrecision) ? 0 : -1; dataConfig.constant = false; - auto inputDT = MKLDNNExtensionUtils::IEPrecisionToDataType( - getCnnLayer()->insData[i].lock()->getPrecision()); - dataConfig.desc = MKLDNNMemoryDesc(getParentEdgeAt(i)->getDims(), inputDT, format); - config.inConfs.push_back(dataConfig); - } - auto outputDT = memory::f32; - auto lastFusedLayer = fusedWith[fusedWith.size() - 1].get()->getCnnLayer(); - if (lastFusedLayer) { - outputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(lastFusedLayer->outData[0]->getPrecision()); + + dataConfig.desc = createMemoryDesc(getParentEdgeAt(i), inputPrecisions[i], offset); + + config.inConfs.push_back(dataConfig); } InferenceEngine::DataConfig dataConfig; dataConfig.inPlace = -1; dataConfig.constant = false; - dataConfig.desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDT, format); - config.outConfs.push_back(dataConfig); - supportedPrimitiveDescriptors.push_back({config, impl_type, format}); + dataConfig.desc = createMemoryDesc(getChildEdgeAt(0), outputPrecision, offset); - jep.src0_step = config.inConfs[0].desc.getDims()[1] == 1 ? 0 : 1; - jep.src1_step = config.inConfs[1].desc.getDims()[1] == 1 ? 
0 : 1; - jep.dst_step = 1; - jep.src0_dt = MKLDNNExtensionUtils::IEPrecisionToDataType(config.inConfs[0].desc.getPrecision()); - jep.src1_dt = MKLDNNExtensionUtils::IEPrecisionToDataType(config.inConfs[1].desc.getPrecision()); - jep.dst_dt = MKLDNNExtensionUtils::IEPrecisionToDataType(config.outConfs[0].desc.getPrecision()); - jep.src0_data_size = MKLDNNExtensionUtils::sizeOfDataType(jep.src0_dt); - jep.src1_data_size = MKLDNNExtensionUtils::sizeOfDataType(jep.src1_dt); - jep.dst_data_size = MKLDNNExtensionUtils::sizeOfDataType(jep.dst_dt); - jep.eltwise_op = op; + config.outConfs.push_back(dataConfig); + impl_desc_type impl_type; if (mayiuse(cpu::avx512_common)) { - eltiwse_fq_kernel.reset(new jit_uni_eltwise_fq_generic(jep, *attr.get())); + impl_type = impl_desc_type::jit_avx512; } else if (mayiuse(cpu::avx2)) { - eltiwse_fq_kernel.reset(new jit_uni_eltwise_fq_generic(jep, *attr.get())); + impl_type = impl_desc_type::jit_avx2; } else if (mayiuse(cpu::sse42)) { - eltiwse_fq_kernel.reset(new jit_uni_eltwise_fq_generic(jep, *attr.get())); + impl_type = impl_desc_type::jit_sse42; + } else { + impl_type = impl_desc_type::ref; } + + return {config, impl_type, MKLDNNMemoryDesc(config.outConfs[0].desc).getFormat()}; + }; + + bool isChannelsFirstApplicable = one_of(getChildEdgeAt(0)->getDims().ndims(), 1, 2, 4, 5); + for (size_t i = 0; i < getParentEdges().size(); i++) { + isChannelsFirstApplicable = isChannelsFirstApplicable && one_of(getParentEdgeAt(i)->getDims().ndims(), 1, 2, 4, 5); + isChannelsFirstApplicable = isChannelsFirstApplicable && getChildEdgeAt(0)->getDims().ndims() == getParentEdgeAt(i)->getDims().ndims(); + } + + bool isBlockedApplicable = one_of(getChildEdgeAt(0)->getDims().ndims(), 4, 5); + for (size_t i = 0; i < getParentEdges().size(); i++) { + isBlockedApplicable = isBlockedApplicable && one_of(getParentEdgeAt(i)->getDims().ndims(), 4, 5); + isBlockedApplicable = isBlockedApplicable && getChildEdgeAt(0)->getDims().ndims() == getParentEdgeAt(i)->getDims().ndims(); } + + if (isChannelsFirstApplicable) + supportedPrimitiveDescriptors.emplace_back(initDesc(ChannelsFirst)); + if (isBlockedApplicable) + supportedPrimitiveDescriptors.emplace_back(initDesc(Blocked)); + supportedPrimitiveDescriptors.emplace_back(initDesc(Planar)); } void MKLDNNEltwiseNode::createPrimitive() { - if (prim) - return; + auto config = getSelectedPrimitiveDescriptor()->getConfig(); - auto& dstMemPtr = getChildEdgeAt(0)->getMemoryPtr(); - if (!dstMemPtr || !dstMemPtr->GetPrimitivePtr()) - THROW_IE_EXCEPTION << "Destination memory didn't allocate."; - if (getSelectedPrimitiveDescriptor() == nullptr) - THROW_IE_EXCEPTION << "Preferable primitive descriptor is not set."; + auto initDims = [this, config](size_t maxInputSize) { + size_t inputNum = getParentEdges().size(); - std::vector srcs_pd; - std::vector srcs_p; - for (size_t i = 0; i < getParentEdges().size(); i++) { - auto& srcMemPtr = getParentEdgeAt(i)->getMemoryPtr(); - if (!srcMemPtr || !srcMemPtr->GetPrimitivePtr()) { - auto parent = getParentEdgeAt(i)->getParent(); - THROW_IE_EXCEPTION << "Source memory from " << parent->getName() << " didn't allocate."; + dims_in.resize(inputNum); + for (int i = 0; i < inputNum; i++) { + dims_in[i].resize(maxInputSize, 1); } - if (op == EltwiseLayer::Sum) { - srcs_pd.push_back(srcMemPtr->GetPrimitiveDescriptor()); - srcs_p.emplace_back(srcMemPtr->GetPrimitive()); + dims_out.resize(maxInputSize, 1); + + std::vector order(maxInputSize); + auto outOrder = config.outConfs[0].desc.getBlockingDesc().getOrder(); + for 
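The applicability rules above, restated as a standalone predicate: a layout is offered only when the output rank is in the allowed set and every input has exactly the output's rank. Channels-first accepts ranks {1, 2, 4, 5}, blocked only {4, 5}, and the planar descriptor is always added last as the universal fallback. Names here are illustrative.

#include <cstddef>
#include <initializer_list>
#include <vector>

bool layoutApplicable(size_t outRank, const std::vector<size_t>& inRanks,
                      std::initializer_list<size_t> allowedRanks) {
    bool ok = false;
    for (size_t r : allowedRanks)
        ok = ok || (outRank == r);   // the output rank must be one of the allowed ranks
    for (size_t r : inRanks)
        ok = ok && (r == outRank);   // every input must match the output rank exactly
    return ok;
}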
(size_t i = 0; i < order.size(); i++) { + if (i < order.size() - outOrder.size()) + order[i] = i; + else + order[i] = outOrder[i - (order.size() - outOrder.size())] + (order.size() - outOrder.size()); } - } - if (op == EltwiseLayer::Sum && !broadcast && fusedWith.empty()) { - try { - auto primitive_desc = mkldnn::sum::primitive_desc(dstMemPtr->GetDescriptor(), sum_scales, srcs_pd); - prim = std::shared_ptr(new mkldnn::sum(primitive_desc, srcs_p, dstMemPtr->GetPrimitive())); - } catch (...) { - std::cerr << "Handle this problem correctly!" << std::endl; - prim = nullptr; + + size_t outRank = config.outConfs[0].desc.getBlockingDesc().getBlockDims().size(); + for (int i = 0; i < outRank; i++) { + dims_out[dims_out.size() - 1 - i] = config.outConfs[0].desc.getBlockingDesc().getBlockDims()[outRank - 1 - i]; } - } -} -void MKLDNNEltwiseNode::initOptimalPrimitiveDescriptor() { - auto selected_pd = getSelectedPrimitiveDescriptor(); - if (selected_pd == nullptr) - THROW_IE_EXCEPTION << "Preferable primitive descriptor is not set."; - auto config = selected_pd->getConfig(); - if (isInitConfig(config)) - return; + for (int i = 0; i < inputNum; i++) { + size_t inRank = config.inConfs[i].desc.getBlockingDesc().getBlockDims().size(); - MKLDNNNode::initOptimalPrimitiveDescriptor(); + // WA to normalize blocked and planar layouts + auto inOrder = config.inConfs[i].desc.getBlockingDesc().getOrder(); + size_t startOff = outOrder.size() != config.outConfs[0].desc.getDims().size() && + outOrder[outOrder.size() - 1] != inOrder[inOrder.size() - 1] ? 1 : 0; - auto* selectedPD = getSelectedPrimitiveDescriptor(); - if (!selectedPD) { - return; - } + for (int j = 0; j < inRank; j++) { + dims_in[i][dims_in[i].size() - 1 - j - startOff] = config.inConfs[i].desc.getBlockingDesc().getBlockDims()[inRank - 1 - j]; + } + } - auto& selectedConfig = getSelectedPrimitiveDescriptor()->getConfig(); - for (size_t i = 1; i < selectedConfig.inConfs.size(); i++) { - if (selectedConfig.inConfs[0].desc.getPrecision() != selectedConfig.inConfs[i].desc.getPrecision()) { - selectedConfig.inConfs[i].desc.setPrecision(selectedConfig.inConfs[0].desc.getPrecision()); + for (int i = 0; i < dims_in.size(); i++) { + for (int j = 0; j < dims_in[i].size(); j++) { + if (dims_in[i][j] != dims_out[j] && dims_in[i][j] != 1) + THROW_IE_EXCEPTION << "Eltwise node with name `" << getName() << "` has invalid input/output dims configuration."; + } } - } -} + }; -void MKLDNNEltwiseNode::setPostOps(mkldnn::primitive_attr &attr, bool initWeights) { - mkldnn::post_ops ops; + auto initOffsets = [this, config](size_t maxInputSize) { + size_t inputNum = getParentEdges().size(); - for (auto &node : fusedWith) { - auto* activationNode = dynamic_cast(node.get()); - if (activationNode) { - ops.append_eltwise(1.0, activationNode->getAlgorithm(), activationNode->getAlpha(), activationNode->getBeta()); + offsets_out.resize(maxInputSize, 1); + offset_out_calc(offsets_out, dims_out); + for (int j = 0; j < maxInputSize; j++) { + offsets_out[j] *= config.outConfs[0].desc.getPrecision().size(); + } - continue; + offsets_in.resize(inputNum); + for (int i = 0; i < inputNum; i++) { + offsets_in[i].resize(maxInputSize, 1); + offset_in_calc(offsets_in[i], dims_in[i], dims_out); + for (int j = 0; j < maxInputSize; j++) { + offsets_in[i][j] *= config.inConfs[i].desc.getPrecision().size(); + } } - auto* quantizeNode = dynamic_cast(node.get()); - if (quantizeNode) { - quantizeNode->appendPostOps(ops); - continue; + start_offset_in.resize(inputNum); + for (size_t i = 0; i < 
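What the initDims normalization above achieves, sketched standalone: every shape is right-aligned into a rank-N vector pre-filled with 1s, so with N = 6 the shape {16, 3, 5} becomes {1, 1, 1, 16, 3, 5}, and broadcasting can then be decided dimension-by-dimension against dims_out. The function name is illustrative.

#include <cstddef>
#include <vector>

std::vector<size_t> rightAlign(const std::vector<size_t>& dims, size_t rank) {
    std::vector<size_t> res(rank, 1);           // missing leading dimensions act as 1
    for (size_t i = 0; i < dims.size(); i++)
        res[rank - dims.size() + i] = dims[i];  // copy, keeping the tail aligned
    return res;
}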
inputNum; i++) { + start_offset_in[i] = getParentEdgeAt(i)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding * + MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(getParentEdgeAt(i)->getMemory().GetDescriptor().data.data_type)); } + start_offset_out = getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding * + MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(getChildEdgeAt(0)->getMemory().GetDescriptor().data.data_type)); + }; - THROW_IE_EXCEPTION << "Fusing of " << NameFromType(node->getType()) << " operation to " << NameFromType(this->getType()) << " node is not implemented"; - } + auto collapseLastDims = [](std::vector& dims, int dimsToCollapse) { + for (int i = dims.size() - 2; i > dims.size() - dimsToCollapse - 2; i--) { + dims[dims.size() - 1] *= dims[i]; + } - attr.set_post_ops(ops); -} + for (int i = dims.size() - 2; i >= dimsToCollapse; i--) { + dims[i] = dims[i - dimsToCollapse]; + } -void MKLDNNEltwiseNode::dims_calc(int *dims, const MKLDNNDims &edge_dims, bool channels_first = false) { - for (int i = 0; i < 5; i++) - dims[i] = 1; - int ndims = edge_dims.ndims(); - if (ndims > 5) { - THROW_IE_EXCEPTION << "ndims should be less then 5"; - } - for (int i = 0; i < ndims; i++) { - dims[4 - i] = edge_dims[ndims - 1 - i]; - } - if (edge_dims.ndims() && !(broadcast && edge_dims[0] == getChildEdgeAt(0)->getDims()[0])) - dims[batch_dim] = std::min(dims[batch_dim], batchToProcess()); - - if (channels_first) { - auto ch_idx = 5 - ndims + 1; - auto ch = dims[ch_idx]; - for (int i = ch_idx; i < 4; i++) { - dims[i] = dims[i + 1]; + for (int i = dimsToCollapse - 1; i >= 0; i--) { + dims[i] = 1; } - dims[4] = ch; - } -} + }; -void MKLDNNEltwiseNode::offset_out_calc(int *offset, int *dims) { - int k = 1; - for (int i = 4; i >= 0; i--) { - offset[i] = k; - k *= dims[i]; - } -} + auto collapseLastOffsets = [](std::vector& dims, int dimsToCollapse) { + for (int i = dims.size() - 2; i > dims.size() - dimsToCollapse - 2; i--) { + if (dims[dims.size() - 1] > 0 || dims[i] > 0) + dims[dims.size() - 1] = std::max(dims[dims.size() - 1], static_cast(1)) * std::max(dims[i], static_cast(1)); + else + dims[dims.size() - 1] *= dims[i]; + } -void MKLDNNEltwiseNode::offset_in_calc(int *offset, int *dims_in, int *dims_out) { - int k = 1; - for (int i = 4; i >= 0; i--) { - offset[i] = (dims_in[i] == dims_out[i]) ? 
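A worked example of collapseLastDims above: for dims = {2, 3, 4, 5} and dimsToCollapse = 1, the tail absorbs its neighbor (4 * 5 = 20), the remaining dims shift right, and the freed leading slots are refilled with 1s, giving {1, 2, 3, 20}. The same three loops in a standalone sketch:

#include <cstddef>
#include <vector>

void collapseLastDims(std::vector<size_t>& dims, int dimsToCollapse) {
    int n = static_cast<int>(dims.size());
    for (int i = n - 2; i > n - dimsToCollapse - 2; i--)
        dims[n - 1] *= dims[i];              // fold into the innermost dimension
    for (int i = n - 2; i >= dimsToCollapse; i--)
        dims[i] = dims[i - dimsToCollapse];  // shift the remaining dims right
    for (int i = dimsToCollapse - 1; i >= 0; i--)
        dims[i] = 1;                         // pad the front with unit dims
}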
k : 0; - k *= dims_in[i]; - } -} + for (int i = dims.size() - 2; i >= dimsToCollapse; i--) { + dims[i] = dims[i - dimsToCollapse]; + } -// Intel C++ Compiler 18.0 for Windows contains bug that doesn't allow to use templates to generate eltwise implementations -// and to avoid all copypaste below -template void MKLDNNEltwiseNode::eltwise_add( - const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) { - if (!broadcast) { -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = src0_ptr[i] + src1_ptr[i]; - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = src0_ptr[i] + src1_ptr[i]; - }); -#endif - for (int j = 2; j < getParentEdges().size(); j++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + - getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = dst_ptr[i] + src_ptr[i]; - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = dst_ptr[i] + src_ptr[i]; - }); -#endif - } - } else { - int dims_out[5], dims_in0[5], dims_in1[5]; - int offset_out[5], offset_in0[5], offset_in1[5]; - auto& child_edge_dims = getChildEdgeAt(0)->getDims(); - auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); - auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); - dims_calc(dims_out, child_edge_dims); - dims_calc(dims_in0, parent0_edge_dims); - dims_calc(dims_in1, parent1_edge_dims); - offset_out_calc(offset_out, dims_out); - offset_in_calc(offset_in0, dims_in0, dims_out); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = src0_ptr[index_in0] + src1_ptr[index_in1]; - } - } - } - } - } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = src0_ptr[index_in0] + src1_ptr[index_in1]; - } - }); -#endif - for (size_t n = 2; n < getParentEdges().size(); n++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + - getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; - - auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); - dims_calc(dims_in1, parent_edge_dims); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - 
for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = dst_ptr[index_out] + src_ptr[index_in]; - } - } - } - } - } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = dst_ptr[index_out] + src_ptr[index_in]; - } - }); -#endif + for (int i = dimsToCollapse - 1; i >= 0; i--) { + dims[i] = 0; } - } -} + }; -template void MKLDNNEltwiseNode::eltwise_prod( - const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) { - if (!broadcast) { -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = src0_ptr[i] * src1_ptr[i]; - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = src0_ptr[i] * src1_ptr[i]; - }); -#endif - for (int j = 2; j < getParentEdges().size(); j++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + - getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = dst_ptr[i] * src_ptr[i]; - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = dst_ptr[i] * src_ptr[i]; - }); -#endif - } - } else { - int dims_out[5], dims_in0[5], dims_in1[5]; - int offset_out[5], offset_in0[5], offset_in1[5]; - auto& child_edge_dims = getChildEdgeAt(0)->getDims(); - auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); - auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); - dims_calc(dims_out, child_edge_dims); - dims_calc(dims_in0, parent0_edge_dims); - dims_calc(dims_in1, parent1_edge_dims); - offset_out_calc(offset_out, dims_out); - offset_in_calc(offset_in0, dims_in0, dims_out); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = src0_ptr[index_in0] * src1_ptr[index_in1]; - } - } - } - } - } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * 
offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = src0_ptr[index_in0] * src1_ptr[index_in1]; - } - }); -#endif - for (size_t n = 2; n < getParentEdges().size(); n++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + - getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; - - auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); - dims_calc(dims_in1, parent_edge_dims); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = dst_ptr[index_out] * src_ptr[index_in]; - } - } - } - } - } -#else - parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = dst_ptr[index_out] * src_ptr[index_in]; - }); -#endif - } - } -} + tensorRank = std::max(static_cast(optimalTensorRank), config.outConfs[0].desc.getBlockingDesc().getBlockDims().size()); + initDims(tensorRank); -template void MKLDNNEltwiseNode::eltwise_max( - const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) { - if (!broadcast) { -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = std::max(src0_ptr[i], (T0)src1_ptr[i]); - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = std::max(src0_ptr[i], (T0)src1_ptr[i]); - }); -#endif - for (int j = 2; j < getParentEdges().size(); j++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + - getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = std::max(dst_ptr[i], (T0)src_ptr[i]); - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = std::max(dst_ptr[i], (T0)src_ptr[i]); - }); -#endif - } - } else { - int dims_out[5], dims_in0[5], dims_in1[5]; - int offset_out[5], offset_in0[5], offset_in1[5]; - auto& child_edge_dims = getChildEdgeAt(0)->getDims(); - auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); - auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); - dims_calc(dims_out, child_edge_dims); - dims_calc(dims_in0, parent0_edge_dims); - dims_calc(dims_in1, parent1_edge_dims); - offset_out_calc(offset_out, dims_out); - offset_in_calc(offset_in0, dims_in0, dims_out); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * 
offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = std::max(src0_ptr[index_in0], (T0)src1_ptr[index_in1]); - } - } + auto outOrder = config.outConfs[0].desc.getBlockingDesc().getOrder(); + size_t oc_size = 0; + offsets_oc.resize(tensorRank, 0); + if (isFusedWith(Quantize)) { + size_t offset_oc = 1; + for (int i = outOrder.size() - 1; i >= 0; i--) { + if (outOrder[i] == 1) { + int oc_dim_idx = i + (tensorRank - outOrder.size()); + offsets_oc[oc_dim_idx] = offset_oc; + offset_oc *= dims_out[oc_dim_idx]; } } + oc_size = offsets_oc[dims_out.size() - 1] != 0 ? dims_out[dims_out.size() - 1] : 1; } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = std::max(src0_ptr[index_in0], (T0)src1_ptr[index_in1]); - } - }); -#endif - for (size_t n = 2; n < getParentEdges().size(); n++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + - getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; - - auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); - dims_calc(dims_in1, parent_edge_dims); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = std::max(dst_ptr[index_out], (T0)src_ptr[index_in]); - } - } - } - } - } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = std::max(dst_ptr[index_out], (T0)src_ptr[index_in]); - } - }); -#endif - } - } -} -template void MKLDNNEltwiseNode::eltwise_sub( - const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) { - if (!broadcast) { -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = src0_ptr[i] - src1_ptr[i]; - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = src0_ptr[i] - src1_ptr[i]; - }); -#endif - for (int j = 2; j < getParentEdges().size(); j++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + - 
getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = dst_ptr[i] - src_ptr[i]; - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = dst_ptr[i] - src_ptr[i]; - }); -#endif - } - } else { - int dims_out[5], dims_in0[5], dims_in1[5]; - int offset_out[5], offset_in0[5], offset_in1[5]; - auto& child_edge_dims = getChildEdgeAt(0)->getDims(); - auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); - auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); - dims_calc(dims_out, child_edge_dims); - dims_calc(dims_in0, parent0_edge_dims); - dims_calc(dims_in1, parent1_edge_dims); - offset_out_calc(offset_out, dims_out); - offset_in_calc(offset_in0, dims_in0, dims_out); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = src0_ptr[index_in0] - src1_ptr[index_in1]; - } - } - } - } + fullWorkAmount = 1; + for (int i = 0; i < dims_out.size(); i++) { + fullWorkAmount *= dims_out[i]; } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = src0_ptr[index_in0] - src1_ptr[index_in1]; - } - }); -#endif - for (size_t n = 2; n < getParentEdges().size(); n++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + - getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; - - auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); - dims_calc(dims_in1, parent_edge_dims); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = dst_ptr[index_out] - src_ptr[index_in]; - } - } + + size_t minimalConcurrency = parallel_get_max_threads(); + size_t minimalJitWorkAmount = 256; + size_t currentJitWorkAmount = dims_out[dims_out.size() - 1]; + int collapsedDims = 0; + if (canUseOptimizedImpl) { + bool hasDifferentDims = false; + while (currentJitWorkAmount < 
minimalJitWorkAmount) { + if (dims_out.size() - collapsedDims - 2 < 0) + break; + + for (int j = 1; j < dims_in.size(); j++) { + if (dims_in[j][dims_in[j].size() - 1] != dims_in[0][dims_in[0].size() - 1]) { + hasDifferentDims = true; } } - } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = dst_ptr[index_out] - src_ptr[index_in]; - } - }); -#endif - } - } -} -template void MKLDNNEltwiseNode::eltwise_min( - const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) { - if (!broadcast) { -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = std::min(src0_ptr[i], (T0)src1_ptr[i]); - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = std::min(src0_ptr[i], (T0)src1_ptr[i]); - }); -#endif - for (int j = 2; j < getParentEdges().size(); j++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + - getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = std::min(dst_ptr[i], (T0)src_ptr[i]); + if (oc_size > 1 && oc_size != dims_in[0][dims_in[0].size() - 1]) { + hasDifferentDims = true; } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = std::min(dst_ptr[i], (T0)src_ptr[i]); - }); -#endif - } - } else { - int dims_out[5], dims_in0[5], dims_in1[5]; - int offset_out[5], offset_in0[5], offset_in1[5]; - auto& child_edge_dims = getChildEdgeAt(0)->getDims(); - auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); - auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); - dims_calc(dims_out, child_edge_dims); - dims_calc(dims_in0, parent0_edge_dims); - dims_calc(dims_in1, parent1_edge_dims); - offset_out_calc(offset_out, dims_out); - offset_in_calc(offset_in0, dims_in0, dims_out); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = std::min(src0_ptr[index_in0], (T0)src1_ptr[index_in1]); + + bool canCollapse = true; + for (int i = 0; i < dims_in.size(); i++) { + if (dims_in[i][dims_in[i].size() - 2] != 1) { + if (dims_in[i][dims_in[i].size() - 1] == 1) { + canCollapse = false; + break; } - } - } - } - } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 
* offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = std::min(src0_ptr[index_in0], (T0)src1_ptr[index_in1]); - } - }); -#endif - for (size_t n = 2; n < getParentEdges().size(); n++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + - getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; - - auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); - dims_calc(dims_in1, parent_edge_dims); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = std::min(dst_ptr[index_out], (T0)src_ptr[index_in]); - } + + if (hasDifferentDims) { + canCollapse = false; + break; } } } - } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = std::min(dst_ptr[index_out], (T0)src_ptr[index_in]); - } - }); -#endif - } - } -} -template void MKLDNNEltwiseNode::eltwise_div( - const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) { - if (!broadcast) { -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = src0_ptr[i] / src1_ptr[i]; - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = src0_ptr[i] / src1_ptr[i]; - }); -#endif - for (int j = 2; j < getParentEdges().size(); j++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + - getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = dst_ptr[i] / src_ptr[i]; + if (!canCollapse) { + break; } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = dst_ptr[i] / src_ptr[i]; - }); -#endif - } - } else { - int dims_out[5], dims_in0[5], dims_in1[5]; - int offset_out[5], offset_in0[5], offset_in1[5]; - auto& child_edge_dims = getChildEdgeAt(0)->getDims(); - auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); - auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); - dims_calc(dims_out, child_edge_dims); - dims_calc(dims_in0, parent0_edge_dims); - dims_calc(dims_in1, parent1_edge_dims); - offset_out_calc(offset_out, dims_out); - offset_in_calc(offset_in0, dims_in0, dims_out); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] 
+ i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = src0_ptr[index_in0] / src1_ptr[index_in1]; - } + + size_t nextJitWorkAmount = currentJitWorkAmount * dims_out[dims_out.size() - 2]; + if (fullWorkAmount / nextJitWorkAmount >= minimalConcurrency) { + currentJitWorkAmount = nextJitWorkAmount; + collapsedDims++; + + for (int i = 0; i < dims_in.size(); i++) { + collapseLastDims(dims_in[i], 1); } - } - } - } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = src0_ptr[index_in0] / src1_ptr[index_in1]; - } - }); -#endif - for (size_t n = 2; n < getParentEdges().size(); n++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + - getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; - - auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); - dims_calc(dims_in1, parent_edge_dims); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = dst_ptr[index_out] / src_ptr[index_in]; - } - } + collapseLastDims(dims_out, 1); + + if (isFusedWith(Quantize)) { + collapseLastOffsets(offsets_oc, 1); } + } else { + break; } } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = dst_ptr[index_out] / src_ptr[index_in]; - } - }); -#endif - } } -} -template void MKLDNNEltwiseNode::eltwise_squared_diff( - const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) { - if (!broadcast) { -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = (src0_ptr[i] - src1_ptr[i]) * (src0_ptr[i] - src1_ptr[i]); + isDynBatchEnabled = config.dynBatchSupport; + batchDimIdx = tensorRank - config.outConfs[0].desc.getBlockingDesc().getBlockDims().size() + collapsedDims; + schedulerWorkAmount = fullWorkAmount / dims_out[dims_out.size() - 1]; + + initOffsets(tensorRank); + + jep.inputs_number = config.inConfs.size(); + jep.input_size = 
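The stopping rule of the collapsing loop above in one function: folding the next-outer dimension into the kernel's inner work amount is accepted only while the remaining outer iteration space can still feed every thread. Under assumed values dims_out = {1, 1, 4, 32, 32} and 8 threads: fullWorkAmount = 4096, the inner dimension gives currentJitWorkAmount = 32 < 256 (minimalJitWorkAmount), the candidate fold is 32 * 32 = 1024, but 4096 / 1024 = 4 < 8, so collapsing stops there. A sketch with illustrative names:

#include <cstddef>

size_t tryCollapse(size_t fullWorkAmount, size_t currentJitWorkAmount,
                   size_t nextDim, size_t minimalConcurrency) {
    size_t next = currentJitWorkAmount * nextDim;
    // keep at least minimalConcurrency outer iterations for the parallel loop
    return (fullWorkAmount / next >= minimalConcurrency) ? next : currentJitWorkAmount;
}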
tensorRank; + + for (int i = 0; i < config.inConfs.size(); i++) { + jep.src_size[i] = dims_in[i][dims_in[i].size() - 1]; + jep.src_prc[i] = config.inConfs[i].desc.getPrecision(); } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = (src0_ptr[i] - src1_ptr[i]) * (src0_ptr[i] - src1_ptr[i]); - }); -#endif - for (int j = 2; j < getParentEdges().size(); j++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + - getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = (dst_ptr[i] - src_ptr[i]) * (dst_ptr[i] - src_ptr[i]); - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = (dst_ptr[i] - src_ptr[i]) * (dst_ptr[i] - src_ptr[i]); - }); -#endif - } - } else { - int dims_out[5], dims_in0[5], dims_in1[5]; - int offset_out[5], offset_in0[5], offset_in1[5]; - auto& child_edge_dims = getChildEdgeAt(0)->getDims(); - auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); - auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); - dims_calc(dims_out, child_edge_dims); - dims_calc(dims_in0, parent0_edge_dims); - dims_calc(dims_in1, parent1_edge_dims); - offset_out_calc(offset_out, dims_out); - offset_in_calc(offset_in0, dims_in0, dims_out); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = (src0_ptr[index_in0] - src1_ptr[index_in1]) * (src0_ptr[index_in0] - src1_ptr[index_in1]); - } - } - } - } + jep.dst_size = dims_out[dims_out.size() - 1]; + jep.dst_prc = config.outConfs[0].desc.getPrecision(); + + for (int i = 0; i < config.inConfs.size(); i++) { + jep.src_offsets[i] = offsets_in[i]; } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = (src0_ptr[index_in0] - src1_ptr[index_in1]) * (src0_ptr[index_in0] - src1_ptr[index_in1]); - } - }); -#endif - for (size_t n = 2; n < getParentEdges().size(); n++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + - getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; - - auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); - dims_calc(dims_in1, parent_edge_dims); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; 
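The three eltwise_kernel.reset branches above instantiate one kernel per ISA from the single generic template, as the mayiuse() cascade suggests; a sketch of that dispatch pattern with illustrative types (KernelBase, the tag structs and makeKernel are assumptions for this example, not the plugin's names):

#include <memory>

struct KernelBase { virtual ~KernelBase() = default; virtual void operator()() = 0; };
struct Avx512 {}; struct Avx2 {}; struct Sse42 {};

template <typename Isa>
struct EltwiseKernelSketch : KernelBase {
    void operator()() override { /* ISA-specific generated code would run here */ }
};

std::unique_ptr<KernelBase> makeKernel(bool hasAvx512, bool hasAvx2, bool hasSse42) {
    if (hasAvx512) return std::unique_ptr<KernelBase>(new EltwiseKernelSketch<Avx512>());
    if (hasAvx2)   return std::unique_ptr<KernelBase>(new EltwiseKernelSketch<Avx2>());
    if (hasSse42)  return std::unique_ptr<KernelBase>(new EltwiseKernelSketch<Sse42>());
    return nullptr;  // no JIT kernel: the node executes the reference path instead
}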
i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = (dst_ptr[index_out] - src_ptr[index_in]) * (dst_ptr[index_out] - src_ptr[index_in]); - } - } - } - } - } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = (dst_ptr[index_out] - src_ptr[index_in]) * (dst_ptr[index_out] - src_ptr[index_in]); - } - }); -#endif - } + jep.dst_offsets = offsets_out; + + jep.oc_size = oc_size; + + if (mayiuse(cpu::avx512_common)) { + eltwise_kernel.reset(new jit_uni_eltwise_generic(jep, *this)); + } else if (mayiuse(cpu::avx2)) { + eltwise_kernel.reset(new jit_uni_eltwise_generic(jep, *this)); + } else if (mayiuse(cpu::sse42)) { + eltwise_kernel.reset(new jit_uni_eltwise_generic(jep, *this)); } } -template void MKLDNNEltwiseNode::eltwise_floor_mod( - const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) { - if (!broadcast) { -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = src0_ptr[i] - src0_ptr[i] / src1_ptr[i] * src1_ptr[i]; - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = src0_ptr[i] - src0_ptr[i] / src1_ptr[i] * src1_ptr[i]; - }); -#endif - for (int j = 2; j < getParentEdges().size(); j++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + - getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = dst_ptr[i] - dst_ptr[i] / src_ptr[i] * src_ptr[i]; - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = dst_ptr[i] - dst_ptr[i] / src_ptr[i] * src_ptr[i]; - }); -#endif - } - } else { - int dims_out[5], dims_in0[5], dims_in1[5]; - int offset_out[5], offset_in0[5], offset_in1[5]; - auto& child_edge_dims = getChildEdgeAt(0)->getDims(); - auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); - auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); - dims_calc(dims_out, child_edge_dims); - dims_calc(dims_in0, parent0_edge_dims); - dims_calc(dims_in1, parent1_edge_dims); - offset_out_calc(offset_out, dims_out); - offset_in_calc(offset_in0, dims_in0, dims_out); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = src0_ptr[index_in0] - 
src0_ptr[index_in0] / src1_ptr[index_in1] * src1_ptr[index_in1]; +void MKLDNNEltwiseNode::selectOptimalPrimitiveDescriptor() { + for (auto& type : getPrimitivesPriority()) { + int selectedPrimitive = -1; + int equalsFormatCount = -1; + for (size_t i = 0; i < getSupportedPrimitiveDescriptors().size(); i++) { + impl_desc_type supportedType = getSupportedPrimitiveDescriptors()[i].getImplementationType(); + if (type == supportedType) { + int equalsLocalFormatCount = 0; + if (getSupportedPrimitiveDescriptors()[i].getConfig().inConfs.size() > getParentEdges().size()) + continue; + for (size_t j = 0; j < getSupportedPrimitiveDescriptors()[i].getConfig().inConfs.size(); j++) { + auto parentEdge = getParentEdgeAt(j); + auto parentPtr = parentEdge->getParent(); + // We don't take into account constant edges since reorders on them will be executed on load network stage + if (j > 0 && parentPtr->isConstant()) { + equalsLocalFormatCount++; + continue; } - } - } - } - } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = src0_ptr[index_in0] - src0_ptr[index_in0] / src1_ptr[index_in1] * src1_ptr[index_in1]; - } - }); -#endif - for (size_t n = 2; n < getParentEdges().size(); n++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + - getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; - - auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); - dims_calc(dims_in1, parent_edge_dims); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = dst_ptr[index_out] - dst_ptr[index_out] / src_ptr[index_in] * src_ptr[index_in]; + + auto parent_spd = parentPtr->getSelectedPrimitiveDescriptor(); + + if (parent_spd != nullptr && !parent_spd->getConfig().outConfs.empty()) { + int inNum = parentEdge->getInputNum(); + if (inNum < 0 || inNum >= parent_spd->getConfig().outConfs.size()) { + inNum = 0; + } + if (MKLDNNExtensionUtils::initTensorsAreEqual( + getSupportedPrimitiveDescriptors()[i].getConfig().inConfs[j].desc, + parent_spd->getConfig().outConfs[inNum].desc)) { + equalsLocalFormatCount++; } } } + if (equalsLocalFormatCount > equalsFormatCount) { + equalsFormatCount = equalsLocalFormatCount; + selectedPrimitive = static_cast(i); + } } } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * 
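The selection loop above as a standalone scoring sketch: within the highest-priority implementation type, the candidate whose input configs already match the most parent output formats wins, which minimizes reorders (inputs from constant parents count as free matches, since their reorders run at load time). Illustrative, not the plugin API:

#include <vector>

int pickBestCandidate(const std::vector<std::vector<bool>>& inputMatches) {
    int best = -1, bestScore = -1;
    for (int i = 0; i < static_cast<int>(inputMatches.size()); i++) {
        int score = 0;
        for (bool m : inputMatches[i])
            score += m ? 1 : 0;  // one vote per input that needs no reorder
        if (score > bestScore) {
            bestScore = score;
            best = i;
        }
    }
    return best;                 // -1 only if there were no candidates at all
}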
offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = dst_ptr[index_out] - dst_ptr[index_out] / src_ptr[index_in] * src_ptr[index_in]; - } - }); -#endif + if (selectedPrimitive >= 0) { + selectPrimitiveDescriptorByIndex(selectedPrimitive); + return; } } + + if (getSupportedPrimitiveDescriptors().empty()) + THROW_IE_EXCEPTION << "Supported primitive descriptors list is empty for node: " << getName(); + // fallback. If there are no primitives from priority list just select a first + selectPrimitiveDescriptorByIndex(0); } -template void MKLDNNEltwiseNode::eltwise_pow( - const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) { - if (!broadcast) { -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = std::pow(src0_ptr[i], src1_ptr[i]); - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = std::pow(src0_ptr[i], src1_ptr[i]); - }); -#endif - for (int j = 2; j < getParentEdges().size(); j++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + - getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = std::pow(dst_ptr[i], src_ptr[i]); - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = std::pow(dst_ptr[i], src_ptr[i]); - }); -#endif - } - } else { - int dims_out[5], dims_in0[5], dims_in1[5]; - int offset_out[5], offset_in0[5], offset_in1[5]; - auto& child_edge_dims = getChildEdgeAt(0)->getDims(); - auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); - auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); - dims_calc(dims_out, child_edge_dims); - dims_calc(dims_in0, parent0_edge_dims); - dims_calc(dims_in1, parent1_edge_dims); - offset_out_calc(offset_out, dims_out); - offset_in_calc(offset_in0, dims_in0, dims_out); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = std::pow(src0_ptr[index_in0], src1_ptr[index_in1]); - } - } - } - } - } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = std::pow(src0_ptr[index_in0], src1_ptr[index_in1]); - } - }); -#endif - for (size_t n = 2; n < getParentEdges().size(); n++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + - 
getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; - - auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); - dims_calc(dims_in1, parent_edge_dims); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = std::pow(dst_ptr[index_out], src_ptr[index_in]); - } - } - } - } - } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = std::pow(dst_ptr[index_out], src_ptr[index_in]); - } - }); -#endif - } +void MKLDNNEltwiseNode::offset_out_calc(std::vector& offset, std::vector& dims) { + int k = 1; + for (int i = offset.size() - 1; i >= 0; i--) { + offset[i] = k; + k *= dims[i]; } } -template void MKLDNNEltwiseNode::eltwise_equal( - const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, const size_t dst_data_size) { - if (!broadcast) { -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = src0_ptr[i] == src1_ptr[i]; - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = src0_ptr[i] == src1_ptr[i]; - }); -#endif - for (int j = 2; j < getParentEdges().size(); j++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + - getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = dst_ptr[i] == src_ptr[i]; - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = dst_ptr[i] == src_ptr[i]; - }); -#endif - } - } else { - int dims_out[5], dims_in0[5], dims_in1[5]; - int offset_out[5], offset_in0[5], offset_in1[5]; - auto& child_edge_dims = getChildEdgeAt(0)->getDims(); - auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); - auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); - dims_calc(dims_out, child_edge_dims); - dims_calc(dims_in0, parent0_edge_dims); - dims_calc(dims_in1, parent1_edge_dims); - offset_out_calc(offset_out, dims_out); - offset_in_calc(offset_in0, dims_in0, dims_out); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - 
dst_ptr[index_out] = src0_ptr[index_in0] == src1_ptr[index_in1]; - } - } - } - } - } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = src0_ptr[index_in0] == src1_ptr[index_in1]; - } - }); -#endif - for (size_t n = 2; n < getParentEdges().size(); n++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + - getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; - - auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); - dims_calc(dims_in1, parent_edge_dims); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = dst_ptr[index_out] == src_ptr[index_in]; - } - } - } - } - } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = dst_ptr[index_out] == src_ptr[index_in]; - } - }); -#endif - } +void MKLDNNEltwiseNode::offset_in_calc(std::vector& offset, std::vector& dims_in, std::vector& dims_out) { + int k = 1; + for (int i = offset.size() - 1; i >= 0; i--) { + offset[i] = (dims_in[i] == dims_out[i]) ? 
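offset_in_calc, whose body completes just below, assigns a stride of zero to every dimension where the input extent differs from the output extent, i.e. where a size-1 input is broadcast: the same source element is then re-read across that axis while the flat index advances normally elsewhere. A standalone version with explicit types:

#include <cstddef>
#include <vector>

void offsetInCalc(std::vector<size_t>& offset, const std::vector<size_t>& dimsIn,
                  const std::vector<size_t>& dimsOut) {
    size_t k = 1;
    for (int i = static_cast<int>(offset.size()) - 1; i >= 0; i--) {
        offset[i] = (dimsIn[i] == dimsOut[i]) ? k : 0;  // zero stride = broadcast
        k *= dimsIn[i];
    }
}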
k : 0; + k *= dims_in[i]; } } -template void MKLDNNEltwiseNode::eltwise_not_equal( - const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, const size_t dst_data_size) { - if (!broadcast) { -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = src0_ptr[i] != src1_ptr[i]; - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = src0_ptr[i] != src1_ptr[i]; - }); -#endif - for (int j = 2; j < getParentEdges().size(); j++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + - getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = dst_ptr[i] != src_ptr[i]; - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = dst_ptr[i] != src_ptr[i]; - }); -#endif - } - } else { - int dims_out[5], dims_in0[5], dims_in1[5]; - int offset_out[5], offset_in0[5], offset_in1[5]; - auto& child_edge_dims = getChildEdgeAt(0)->getDims(); - auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); - auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); - dims_calc(dims_out, child_edge_dims); - dims_calc(dims_in0, parent0_edge_dims); - dims_calc(dims_in1, parent1_edge_dims); - offset_out_calc(offset_out, dims_out); - offset_in_calc(offset_in0, dims_in0, dims_out); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = src0_ptr[index_in0] != src1_ptr[index_in1]; - } - } +void MKLDNNEltwiseNode::executeOptimized6D(const std::vector& src_ptrs, uint8_t *dst_ptr) { + size_t inputNum = src_ptrs.size(); + + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], + [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + // TODO: reimplement initializer via jit approach + size_t index_in[MAX_ELTWISE_INPUTS] = {0}; + for (int i = 0; i < inputNum; i++) { + index_in[i] = i0 * offsets_in[i][0] + i1 * offsets_in[i][1] + i2 * offsets_in[i][2] + + i3 * offsets_in[i][3] + i4 * offsets_in[i][4]; } - } - } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = src0_ptr[index_in0] != src1_ptr[index_in1]; + size_t index_out = i0 * offsets_out[0] + i1 * offsets_out[1] + i2 * offsets_out[2] + + i3 * offsets_out[3] + i4 * offsets_out[4]; + + auto arg = jit_eltwise_call_args(); + for (int i = 0; i < inputNum; i++) { + arg.src_ptr[i] = src_ptrs[i] + index_in[i]; } + arg.dst = 
dst_ptr + index_out; + arg.work_amount = static_cast(dims_out[dims_out.size() - 1]); + arg.oc_off = (i0 * offsets_oc[0] + i1 * offsets_oc[1] + i2 * offsets_oc[2] + + i3 * offsets_oc[3] + i4 * offsets_oc[4]) * sizeof(float); + + (*eltwise_kernel)(&arg); }); -#endif - for (size_t n = 2; n < getParentEdges().size(); n++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + - getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; - - auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); - dims_calc(dims_in1, parent_edge_dims); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = dst_ptr[index_out] != src_ptr[index_in]; - } - } - } - } - } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = dst_ptr[index_out] != src_ptr[index_in]; - } - }); -#endif - } - } } -template void MKLDNNEltwiseNode::eltwise_less( - const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, const size_t dst_data_size) { - if (!broadcast) { -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = src0_ptr[i] < src1_ptr[i]; - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = src0_ptr[i] < src1_ptr[i]; - }); -#endif - for (int j = 2; j < getParentEdges().size(); j++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + - getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = dst_ptr[i] < src_ptr[i]; - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = dst_ptr[i] < src_ptr[i]; - }); -#endif - } - } else { - int dims_out[5], dims_in0[5], dims_in1[5]; - int offset_out[5], offset_in0[5], offset_in1[5]; - auto& child_edge_dims = getChildEdgeAt(0)->getDims(); - auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); - auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); - dims_calc(dims_out, child_edge_dims); - dims_calc(dims_in0, parent0_edge_dims); - dims_calc(dims_in1, parent1_edge_dims); - offset_out_calc(offset_out, dims_out); - offset_in_calc(offset_in0, dims_in0, dims_out); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * 
offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = src0_ptr[index_in0] < src1_ptr[index_in1]; - } - } - } - } - } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = src0_ptr[index_in0] < src1_ptr[index_in1]; +void MKLDNNEltwiseNode::executeOptimizedGeneric(const std::vector& src_ptrs, uint8_t *dst_ptr) { + size_t inputNum = src_ptrs.size(); + + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t start = 0, end = 0; + splitter(schedulerWorkAmount, nthr, ithr, start, end); + + std::vector counters(dims_out.size() - 1, 0); + + for (size_t iwork = start; iwork < end; ++iwork) { + size_t tmp = iwork; + for (ptrdiff_t j = dims_out.size() - 2; j >= 0; j--) { + counters[j] = tmp % dims_out[j]; + tmp /= dims_out[j]; } - }); -#endif - for (size_t n = 2; n < getParentEdges().size(); n++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + - getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; - - auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); - dims_calc(dims_in1, parent_edge_dims); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = dst_ptr[index_out] < src_ptr[index_in]; - } - } + + size_t index_in[MAX_ELTWISE_INPUTS] = {0}; + for (int i = 0; i < inputNum; i++) { + index_in[i] = 0; + for (int j = 0; j < counters.size(); j++) { + index_in[i] += counters[j] * offsets_in[i][j]; } } - } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = dst_ptr[index_out] < src_ptr[index_in]; - } - }); -#endif - } - } -} -template void MKLDNNEltwiseNode::eltwise_less_equal( - const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, const size_t dst_data_size) { - if (!broadcast) { -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = src0_ptr[i] <= src1_ptr[i]; - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = src0_ptr[i] <= src1_ptr[i]; - }); -#endif - for (int j = 2; j < getParentEdges().size(); j++) { - 
const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + - getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = dst_ptr[i] <= src_ptr[i]; - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = dst_ptr[i] <= src_ptr[i]; - }); -#endif - } - } else { - int dims_out[5], dims_in0[5], dims_in1[5]; - int offset_out[5], offset_in0[5], offset_in1[5]; - auto& child_edge_dims = getChildEdgeAt(0)->getDims(); - auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); - auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); - dims_calc(dims_out, child_edge_dims); - dims_calc(dims_in0, parent0_edge_dims); - dims_calc(dims_in1, parent1_edge_dims); - offset_out_calc(offset_out, dims_out); - offset_in_calc(offset_in0, dims_in0, dims_out); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = src0_ptr[index_in0] <= src1_ptr[index_in1]; - } - } + size_t index_out = 0; + for (int j = 0; j < counters.size(); j++) { + index_out += counters[j] * offsets_out[j]; } - } - } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = src0_ptr[index_in0] <= src1_ptr[index_in1]; + + auto arg = jit_eltwise_call_args(); + for (int i = 0; i < inputNum; i++) { + arg.src_ptr[i] = src_ptrs[i] + index_in[i]; } - }); -#endif - for (size_t n = 2; n < getParentEdges().size(); n++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + - getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; - - auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); - dims_calc(dims_in1, parent_edge_dims); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = dst_ptr[index_out] <= src_ptr[index_in]; - } - } - } + arg.dst = dst_ptr + index_out; + arg.work_amount = 
static_cast(dims_out[dims_out.size() - 1]); + + arg.oc_off = 0; + for (int j = 0; j < counters.size(); j++) { + arg.oc_off += counters[j] * offsets_oc[j] * sizeof(float); } + + (*eltwise_kernel)(&arg); } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = dst_ptr[index_out] <= src_ptr[index_in]; - } - }); -#endif - } - } + }); } -template void MKLDNNEltwiseNode::eltwise_greater( - const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, const size_t dst_data_size) { - if (!broadcast) { -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = src0_ptr[i] > src1_ptr[i]; - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = src0_ptr[i] > src1_ptr[i]; - }); -#endif - for (int j = 2; j < getParentEdges().size(); j++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + - getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = dst_ptr[i] > src_ptr[i]; - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = dst_ptr[i] > src_ptr[i]; - }); -#endif - } - } else { - int dims_out[5], dims_in0[5], dims_in1[5]; - int offset_out[5], offset_in0[5], offset_in1[5]; - auto& child_edge_dims = getChildEdgeAt(0)->getDims(); - auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); - auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); - dims_calc(dims_out, child_edge_dims); - dims_calc(dims_in0, parent0_edge_dims); - dims_calc(dims_in1, parent1_edge_dims); - offset_out_calc(offset_out, dims_out); - offset_in_calc(offset_in0, dims_in0, dims_out); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = src0_ptr[index_in0] > src1_ptr[index_in1]; - } - } - } - } +void MKLDNNEltwiseNode::executeReference(const std::vector& src_ptrs, uint8_t *dst_ptr) { + size_t inputNum = src_ptrs.size(); + + std::shared_ptr ref_eltwise_injector = nullptr; + if (eltwiseAlgorithm != mkldnn::algorithm_undef) { + ref_eltwise_injector = std::make_shared(static_cast(eltwiseAlgorithm), alpha, beta); } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t 
index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = src0_ptr[index_in0] > src1_ptr[index_in1]; + + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t start = 0, end = 0; + splitter(fullWorkAmount, nthr, ithr, start, end); + + std::vector counters(dims_out.size(), 0); + + for (size_t iwork = start; iwork < end; ++iwork) { + size_t tmp = iwork; + for (ptrdiff_t j = dims_out.size() - 1; j >= 0; j--) { + counters[j] = tmp % dims_out[j]; + tmp /= dims_out[j]; } - }); -#endif - for (size_t n = 2; n < getParentEdges().size(); n++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + - getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; - - auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); - dims_calc(dims_in1, parent_edge_dims); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = dst_ptr[index_out] > src_ptr[index_in]; - } - } + + size_t index_in[MAX_ELTWISE_INPUTS] = {0}; + for (int i = 0; i < inputNum; i++) { + index_in[i] = 0; + for (int j = 0; j < counters.size(); j++) { + index_in[i] += counters[j] * offsets_in[i][j]; } } - } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = dst_ptr[index_out] > src_ptr[index_in]; - } - }); -#endif - } - } + + size_t index_out = 0; + for (int j = 0; j < counters.size(); j++) { + index_out += counters[j] * offsets_out[j]; + } + + std::vector src_f(inputNum); + for (int i = 0; i < inputNum; i++) { + src_f[i] = reinterpret_cast(src_ptrs[i] + index_in[i])[0]; + } + float* dst_ptr_f = reinterpret_cast(dst_ptr + index_out); + + switch (getOpType()) { + case Relu: case Gelu: case Elu: case Tanh: case Logistic: case Square: case Abs: case Sqrt: + case Linear: case BoundedRelu: case SoftRelu: case Relu6: case Exp: case Clamp: case Swish: case Hswish: case Mish: case Hsigmoid: + *dst_ptr_f = ref_eltwise_injector->compute_scalar(src_f[0]); break; + case Add: *dst_ptr_f = src_f[0] + src_f[1]; break; + case MulAdd: *dst_ptr_f = src_f[0] * src_f[1] + src_f[2]; break; + case Subtract: *dst_ptr_f = src_f[0] - src_f[1]; break; + case Multiply: *dst_ptr_f = src_f[0] * src_f[1]; break; + case Divide: *dst_ptr_f = src_f[0] / src_f[1]; break; + case FloorMod: *dst_ptr_f = src_f[0] - floorf(src_f[0] / src_f[1]) * src_f[1]; break; + case Mod: *dst_ptr_f = src_f[0] - truncf(src_f[0] / src_f[1]) * src_f[1]; break; + case Maximum: *dst_ptr_f = std::max(src_f[0], src_f[1]); break; + case Minimum: *dst_ptr_f = std::min(src_f[0], src_f[1]); break; + case SquaredDifference: *dst_ptr_f = powf((src_f[0] - src_f[1]), 
2.f); break;
+                case PowerDynamic: *dst_ptr_f = powf(src_f[0], src_f[1]); break;
+                case Equal: *dst_ptr_f = src_f[0] == src_f[1]; break;
+                case NotEqual: *dst_ptr_f = src_f[0] != src_f[1]; break;
+                case Greater: *dst_ptr_f = src_f[0] > src_f[1]; break;
+                case GreaterEqual: *dst_ptr_f = src_f[0] >= src_f[1]; break;
+                case Less: *dst_ptr_f = src_f[0] < src_f[1]; break;
+                case LessEqual: *dst_ptr_f = src_f[0] <= src_f[1]; break;
+                case LogicalAnd: *dst_ptr_f = src_f[0] && src_f[1]; break;
+                case LogicalOr: *dst_ptr_f = src_f[0] || src_f[1]; break;
+                case LogicalXor: *dst_ptr_f = (src_f[0] || src_f[1]) - (src_f[0] && src_f[1]); break;
+                case LogicalNot: *dst_ptr_f = !src_f[0]; break;
+                case PowerStatic: *dst_ptr_f = powf(beta * src_f[0] + gamma, alpha); break;
+                case Prelu: *dst_ptr_f = src_f[0] > 0 ? src_f[0] : src_f[0] * src_f[1]; break;
+                default: THROW_IE_EXCEPTION << "Unsupported operation type for Eltwise node with name `" << getName() << "`";
+            }
+        }
+    });
}

-template <typename T0, typename T1, typename T2> void MKLDNNEltwiseNode::eltwise_greater_equal(
-        const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, const size_t dst_data_size) {
-    if (!broadcast) {
-#ifdef _WIN32
-        for (size_t i = 0; i < dst_data_size; i++) {
-            dst_ptr[i] = src0_ptr[i] >= src1_ptr[i];
+void MKLDNNEltwiseNode::execute(mkldnn::stream strm) {
+    size_t inputNum = getParentEdges().size();
+
+    std::vector<const uint8_t *> src_ptrs(inputNum);
+    for (int i = 0; i < inputNum; i++) {
+        src_ptrs[i] = reinterpret_cast<const uint8_t *>(getParentEdgeAt(i)->getMemory().GetData()) + start_offset_in[i];
    }
-#else
-        parallel_for(dst_data_size, [&](size_t i) {
-            dst_ptr[i] = src0_ptr[i] >= src1_ptr[i];
-        });
-#endif
-        for (int j = 2; j < getParentEdges().size(); j++) {
-            const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(j)->getMemory().GetData()) +
-                    getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
-            for (size_t i = 0; i < dst_data_size; i++) {
-                dst_ptr[i] = dst_ptr[i] >= src_ptr[i];
-            }
-#else
-            parallel_for(dst_data_size, [&](size_t i) {
-                dst_ptr[i] = dst_ptr[i] >= src_ptr[i];
-            });
-#endif
+    uint8_t *dst_ptr = reinterpret_cast<uint8_t *>(getChildEdgeAt(0)->getMemory().GetData()) + start_offset_out;
+
+    // In the general case we would need to recompute the offsets as well, but currently all supported layouts assume batch to be the outermost dimension
+    if (isDynBatchEnabled)
+        dims_out[batchDimIdx] = static_cast<size_t>(batchToProcess());
+
+    if (eltwise_kernel) {
+        if (tensorRank == optimalTensorRank) {
+            executeOptimized6D(src_ptrs, dst_ptr);
+        } else {
+            executeOptimizedGeneric(src_ptrs, dst_ptr);
        }
    } else {
-        int dims_out[5], dims_in0[5], dims_in1[5];
-        int offset_out[5], offset_in0[5], offset_in1[5];
-        auto& child_edge_dims = getChildEdgeAt(0)->getDims();
-        auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
-        auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
-        dims_calc(dims_out, child_edge_dims);
-        dims_calc(dims_in0, parent0_edge_dims);
-        dims_calc(dims_in1, parent1_edge_dims);
-        offset_out_calc(offset_out, dims_out);
-        offset_in_calc(offset_in0, dims_in0, dims_out);
-        offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
-        for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
-            for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
-                for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
-                    for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
-                        for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
-                            size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
-                            size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 *
offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = src0_ptr[index_in0] >= src1_ptr[index_in1]; - } - } - } - } - } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = src0_ptr[index_in0] >= src1_ptr[index_in1]; - } - }); -#endif - for (size_t n = 2; n < getParentEdges().size(); n++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + - getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; - - auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); - dims_calc(dims_in1, parent_edge_dims); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = dst_ptr[index_out] >= src_ptr[index_in]; - } - } - } - } - } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = dst_ptr[index_out] >= src_ptr[index_in]; - } - }); -#endif - } + executeReference(src_ptrs, dst_ptr); } } -template void MKLDNNEltwiseNode::eltwise_logical_and( - const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) { - if (!broadcast) { -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = src0_ptr[i] && src1_ptr[i]; - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = src0_ptr[i] && src1_ptr[i]; - }); -#endif - for (int j = 2; j < getParentEdges().size(); j++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + - getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = dst_ptr[i] && src_ptr[i]; - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = dst_ptr[i] && src_ptr[i]; - }); -#endif - } - } else { - int dims_out[5], dims_in0[5], dims_in1[5]; - int offset_out[5], offset_in0[5], offset_in1[5]; - auto& child_edge_dims = getChildEdgeAt(0)->getDims(); - auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); - auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); - dims_calc(dims_out, 
child_edge_dims); - dims_calc(dims_in0, parent0_edge_dims); - dims_calc(dims_in1, parent1_edge_dims); - offset_out_calc(offset_out, dims_out); - offset_in_calc(offset_in0, dims_in0, dims_out); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = src0_ptr[index_in0] && src1_ptr[index_in1]; - } - } - } - } - } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = src0_ptr[index_in0] && src1_ptr[index_in1]; - } - }); -#endif - for (size_t n = 2; n < getParentEdges().size(); n++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + - getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; - - auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); - dims_calc(dims_in1, parent_edge_dims); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = dst_ptr[index_out] && src_ptr[index_in]; - } - } - } - } - } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = dst_ptr[index_out] && src_ptr[index_in]; - } - }); -#endif - } - } +bool MKLDNNEltwiseNode::created() const { + return getType() == Eltwise; } -template void MKLDNNEltwiseNode::eltwise_logical_or( - const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) { - if (!broadcast) { -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = src0_ptr[i] || src1_ptr[i]; - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = src0_ptr[i] || src1_ptr[i]; - }); -#endif - for (int j = 2; j < 
getParentEdges().size(); j++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + - getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = dst_ptr[i] || src_ptr[i]; - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = dst_ptr[i] || src_ptr[i]; - }); -#endif - } - } else { - int dims_out[5], dims_in0[5], dims_in1[5]; - int offset_out[5], offset_in0[5], offset_in1[5]; - auto& child_edge_dims = getChildEdgeAt(0)->getDims(); - auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); - auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); - dims_calc(dims_out, child_edge_dims); - dims_calc(dims_in0, parent0_edge_dims); - dims_calc(dims_in1, parent1_edge_dims); - offset_out_calc(offset_out, dims_out); - offset_in_calc(offset_in0, dims_in0, dims_out); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = src0_ptr[index_in0] || src1_ptr[index_in1]; - } - } - } - } +bool MKLDNNEltwiseNode::canBeInPlace() const { + if (getParentEdgesAtPort(0)[0]->getParent()->getType() == Input) { + return false; } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = src0_ptr[index_in0] || src1_ptr[index_in1]; - } - }); -#endif - for (size_t n = 2; n < getParentEdges().size(); n++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + - getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; - - auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); - dims_calc(dims_in1, parent_edge_dims); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = dst_ptr[index_out] || src_ptr[index_in]; - } - } - } + + for (auto& parentEdge : getParentEdges()) { + auto parent = parentEdge.lock()->getParent(); + if (parent->getChildEdges().size() 
!= 1) + return false; + + // WA to prevent memory corruption caused by inplace feature + if (parent->getType() == Concatenation) { + for (auto& parentParentEdge : parent->getParentEdges()) { + auto parentParent = parentParentEdge.lock()->getParent(); + if (parentParent->getChildEdges().size() != 1) + return false; } } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = dst_ptr[index_out] || src_ptr[index_in]; - } - }); -#endif - } } + + return getParentEdgesAtPort(0)[0].get()->getDims() == getChildEdgesAtPort(0)[0].get()->getDims(); } -template void MKLDNNEltwiseNode::eltwise_logical_xor( - const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) { - if (!broadcast) { -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = (src0_ptr[i] || src1_ptr[i]) - (src0_ptr[i] && src1_ptr[i]); - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = (src0_ptr[i] || src1_ptr[i]) - (src0_ptr[i] && src1_ptr[i]); - }); -#endif - for (int j = 2; j < getParentEdges().size(); j++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + - getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; -#ifdef _WIN32 - for (size_t i = 0; i < dst_data_size; i++) { - dst_ptr[i] = (dst_ptr[i] || src_ptr[i]) - (dst_ptr[i] && src_ptr[i]); - } -#else - parallel_for(dst_data_size, [&](size_t i) { - dst_ptr[i] = (dst_ptr[i] || src_ptr[i]) - (dst_ptr[i] && src_ptr[i]); - }); -#endif - } - } else { - int dims_out[5], dims_in0[5], dims_in1[5]; - int offset_out[5], offset_in0[5], offset_in1[5]; - auto& child_edge_dims = getChildEdgeAt(0)->getDims(); - auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); - auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); - dims_calc(dims_out, child_edge_dims); - dims_calc(dims_in0, parent0_edge_dims); - dims_calc(dims_in1, parent1_edge_dims); - offset_out_calc(offset_out, dims_out); - offset_in_calc(offset_in0, dims_in0, dims_out); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = (src0_ptr[index_in0] || src1_ptr[index_in1]) - (src0_ptr[index_in0] && src1_ptr[index_in1]); - } +void MKLDNNEltwiseNode::appendPostOps(mkldnn::post_ops& ops) { + switch (getAlgorithm()) { + case mkldnn::eltwise_relu: + case mkldnn::eltwise_tanh: + case mkldnn::eltwise_elu: + case mkldnn::eltwise_square: + case mkldnn::eltwise_abs: + case mkldnn::eltwise_sqrt: + case mkldnn::eltwise_linear: + case mkldnn::eltwise_bounded_relu: + case mkldnn::eltwise_soft_relu: 
+ case mkldnn::eltwise_logistic: + case mkldnn::eltwise_exp: + case mkldnn::eltwise_gelu: + case mkldnn::eltwise_clamp: + case mkldnn::eltwise_swish: + case mkldnn::eltwise_hswish: + case mkldnn::eltwise_mish: + case mkldnn::eltwise_hsigmoid: + ops.append_eltwise(1.0, getAlgorithm(), getAlpha(), getBeta()); + break; + case mkldnn::depthwise_scale_shift: + case mkldnn::depthwise_prelu: + if (scales.empty() && shifts.empty()) { + size_t bufferSize = static_cast(outDims[0][outDims[0].size() > 1 ? 1 : 0]); + size_t bufferSizeAligned = rnd_up(bufferSize, 16); + + Blob::Ptr scalesBlob = getCnnLayer()->blobs["weights"]; + if (scalesBlob == nullptr) + THROW_IE_EXCEPTION << "Cannot get weights blob in Eltwise node with name `" << getName() << "`"; + scales.resize(bufferSizeAligned, 0); + const float *scalesBufferPtr = scalesBlob->buffer().as(); + for (int i = 0; i < bufferSize; i++) { + scales[i] = scalesBufferPtr[scalesBlob->size() == 1 ? 0 : i]; } - } - } - } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = (src0_ptr[index_in0] || src1_ptr[index_in1]) - (src0_ptr[index_in0] && src1_ptr[index_in1]); - } - }); -#endif - for (size_t n = 2; n < getParentEdges().size(); n++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + - getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; - - auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); - dims_calc(dims_in1, parent_edge_dims); - offset_in_calc(offset_in1, dims_in1, dims_out); - -#ifdef _WIN32 - for (size_t i0 = 0; i0 < dims_out[0]; i0++) { - for (size_t i1 = 0; i1 < dims_out[1]; i1++) { - for (size_t i2 = 0; i2 < dims_out[2]; i2++) { - for (size_t i3 = 0; i3 < dims_out[3]; i3++) { - for (size_t i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = (dst_ptr[index_out] || src_ptr[index_in]) - (dst_ptr[index_out] && src_ptr[index_in]); - } + + Blob::Ptr shiftsBlob = getCnnLayer()->blobs["biases"]; + if (shiftsBlob != nullptr) { + shifts.resize(bufferSizeAligned, 0); + const float *shiftsBufferPtr = shiftsBlob->buffer().as(); + for (int i = 0; i < bufferSize; i++) { + shifts[i] = shiftsBufferPtr[shiftsBlob->size() == 1 ? 
0 : i]; } } } - } -#else - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - for (int i4 = 0; i4 < dims_out[4]; i4++) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; - size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; - dst_ptr[index_out] = (dst_ptr[index_out] || src_ptr[index_in]) - (dst_ptr[index_out] && src_ptr[index_in]); - } - }); -#endif - } - } -} - -template void MKLDNNEltwiseNode::ref_eltwise2(int in0, int in1) { - IE_ASSERT(getParentEdges().size() > 1); - - auto& srcMemory0 = getParentEdgeAt(in0)->getMemory(); - auto& srcMemory1 = getParentEdgeAt(in1)->getMemory(); - const T0 *src0_ptr = reinterpret_cast(srcMemory0.GetData()) + - srcMemory0.GetDescriptor().data.layout_desc.blocking.offset_padding; - const T1 *src1_ptr = reinterpret_cast(srcMemory1.GetData()) + - srcMemory1.GetDescriptor().data.layout_desc.blocking.offset_padding; - T2 *dst_ptr = reinterpret_cast(getChildEdgeAt(0)->getMemory().GetData()) + - getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; - - const size_t dst_data_size = srcMemory0.GetSize() / sizeof(T0) / srcMemory0.GetDims()[0] * batchToProcess(); - - switch (op) { - case EltwiseLayer::eOperation::Equal: eltwise_equal(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break; - case EltwiseLayer::eOperation::Not_equal: eltwise_not_equal(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break; - case EltwiseLayer::eOperation::Less: eltwise_less(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break; - case EltwiseLayer::eOperation::Less_equal: eltwise_less_equal(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break; - case EltwiseLayer::eOperation::Greater: eltwise_greater(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break; - case EltwiseLayer::eOperation::Greater_equal: eltwise_greater_equal(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break; - default: THROW_IE_EXCEPTION << "Unsupported operation type for Eltwise node"; - } -} - -template void MKLDNNEltwiseNode::ref_eltwise(int in0, int in1) { - IE_ASSERT(getParentEdges().size() > 1); - - auto& srcMemory0 = getParentEdgeAt(in0)->getMemory(); - auto& srcMemory1 = getParentEdgeAt(in1)->getMemory(); - const T0 *src0_ptr = reinterpret_cast(srcMemory0.GetData()) + - srcMemory0.GetDescriptor().data.layout_desc.blocking.offset_padding; - const T1 *src1_ptr = reinterpret_cast(srcMemory1.GetData()) + - srcMemory1.GetDescriptor().data.layout_desc.blocking.offset_padding; - T0 *dst_ptr = reinterpret_cast(getChildEdgeAt(0)->getMemory().GetData()) + - getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; - - const size_t dst_data_size = srcMemory0.GetSize() / sizeof(T0) / srcMemory0.GetDims()[0] * batchToProcess(); - - switch (op) { - case EltwiseLayer::eOperation::Sum: eltwise_add(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break; - case EltwiseLayer::eOperation::Prod: eltwise_prod(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break; - case EltwiseLayer::eOperation::Max: eltwise_max(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break; - case EltwiseLayer::eOperation::Sub: eltwise_sub(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break; - case EltwiseLayer::eOperation::Min: eltwise_min(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break; - case EltwiseLayer::eOperation::Div: eltwise_div(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break; - case 
EltwiseLayer::eOperation::Squared_diff: eltwise_squared_diff(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break; - case EltwiseLayer::eOperation::Floor_mod: eltwise_floor_mod(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break; - case EltwiseLayer::eOperation::Pow: eltwise_pow(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break; - case EltwiseLayer::eOperation::Equal: eltwise_equal(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break; - case EltwiseLayer::eOperation::Not_equal: eltwise_not_equal(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break; - case EltwiseLayer::eOperation::Less: eltwise_less(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break; - case EltwiseLayer::eOperation::Less_equal: eltwise_less_equal(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break; - case EltwiseLayer::eOperation::Greater: eltwise_greater(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break; - case EltwiseLayer::eOperation::Greater_equal: eltwise_greater_equal(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break; - case EltwiseLayer::eOperation::Logical_AND: eltwise_logical_and(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break; - case EltwiseLayer::eOperation::Logical_OR: eltwise_logical_or(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break; - case EltwiseLayer::eOperation::Logical_XOR: eltwise_logical_xor(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break; - default: THROW_IE_EXCEPTION << "Unsupported operation type for Eltwise node"; - } -} -void MKLDNNEltwiseNode::jit_eltwise_fq() { - auto& srcMemory0 = getParentEdgeAt(0)->getMemory(); - auto& srcMemory1 = getParentEdgeAt(1)->getMemory(); - auto& dstMemory = getChildEdgeAt(0)->getMemory(); - - const uint8_t *src0_ptr = reinterpret_cast(srcMemory0.GetData()) + - srcMemory0.GetDescriptor().data.layout_desc.blocking.offset_padding * - MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(srcMemory0.GetDescriptor().data.data_type)); - const uint8_t *src1_ptr = reinterpret_cast(srcMemory1.GetData()) + - srcMemory1.GetDescriptor().data.layout_desc.blocking.offset_padding * - MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(srcMemory1.GetDescriptor().data.data_type)); - uint8_t *dst_ptr = reinterpret_cast(dstMemory.GetData()) + - dstMemory.GetDescriptor().data.layout_desc.blocking.offset_padding * - MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(dstMemory.GetDescriptor().data.data_type)); - - if (!broadcast) { - auto& dims = getParentEdgeAt(0)->getDims(); - - int N = batchToProcess(); - int C = dims[1]; - int D = dims.ndims() > 4 ? dims[2] : 1; - int H = dims.ndims() > 2 ? dims[dims.ndims() - 2] : 1; - int W = dims.ndims() > 3 ? 
dims[dims.ndims() - 1] : 1; - - parallel_for4d(N, D, H, W, [&](int n, int d, int h, int w) { - size_t off = n * D * H * W * C + d * H * W * C + h * W * C + w * C; - - auto arg = jit_eltwise_fq_call_args(); - arg.src0 = src0_ptr + off * jep.src0_data_size; - arg.src1 = src1_ptr + off * jep.src1_data_size; - arg.dst = dst_ptr + off * jep.dst_data_size; - arg.work_amount = static_cast(C); - - (*eltiwse_fq_kernel)(&arg); - }); - } else { - int dims_out[5], dims_in0[5], dims_in1[5]; - int offset_out[5], offset_in0[5], offset_in1[5]; - auto& child_edge_dims = getChildEdgeAt(0)->getDims(); - auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); - auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); - dims_calc(dims_out, child_edge_dims, true); - dims_calc(dims_in0, parent0_edge_dims, true); - dims_calc(dims_in1, parent1_edge_dims, true); - offset_out_calc(offset_out, dims_out); - offset_in_calc(offset_in0, dims_in0, dims_out); - offset_in_calc(offset_in1, dims_in1, dims_out); - - parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) { - size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3]; - size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3]; - size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3]; - - auto arg = jit_eltwise_fq_call_args(); - arg.src0 = src0_ptr + index_in0 * jep.src0_data_size; - arg.src1 = src1_ptr + index_in1 * jep.src1_data_size; - arg.dst = dst_ptr + index_out * jep.dst_data_size; - arg.work_amount = static_cast(dims_out[4]); - - (*eltiwse_fq_kernel)(&arg); - }); + ops.append_depthwise(getAlgorithm(), &scales[0], shifts.empty() ? 
nullptr : &shifts[0]);
+            break;
+        default: THROW_IE_EXCEPTION << "Appending Eltwise node with name `" << getName() << "` as post operation is not supported";
    }
}

-void MKLDNNEltwiseNode::execute(mkldnn::stream strm) {
-    if (prim) {
-        MKLDNNNode::execute(strm);
-    } else {
-        if (op == EltwiseLayer::Floor_mod) {
-            for (size_t i = 0; i < getParentEdges().size(); i++)
-                if (getParentEdgeAt(i)->getDesc().getPrecision() != Precision::I32)
-                    THROW_IE_EXCEPTION << "Floor_mod supports only I32 precision of inputs";
-            if (getChildEdgeAt(0)->getDesc().getPrecision() != Precision::I32)
-                THROW_IE_EXCEPTION << "Floor_mod supports only I32 precision of output";
-        }
-
-        if (getParentEdges().size() > 2) {
-            Precision pi = getParentEdgeAt(0)->getDesc().getPrecision();
-            Precision po = getChildEdgeAt(0)->getDesc().getPrecision();
-            for (int i = 1; i < getParentEdges().size(); i++) {
-                if (getParentEdgeAt(i)->getDesc().getPrecision() != pi)
-                    THROW_IE_EXCEPTION << "If Eltwise node has more than 2 inputs, all inputs must have same precision";
+bool MKLDNNEltwiseNode::canFuse(const MKLDNNNodePtr& node) const {
+    auto isOneOf = [](EltwiseOpType alg, std::vector<EltwiseOpType> algs) {
+        for (auto a : algs) {
+            if (alg == a) {
+                return true;
            }
-            if (pi != po) {
-                THROW_IE_EXCEPTION << "If Eltwise node has more than 2 inputs, all inputs and output must have same precision";
-            }
-            if (pi == Precision::FP32)
-                ref_eltwise<float, float>(0, 1);
-            else if (pi == Precision::I32)
-                ref_eltwise<int32_t, int32_t>(0, 1);
-            else if (pi == Precision::I8)
-                ref_eltwise<int8_t, int8_t>(0, 1);
-            else if (pi == Precision::U8)
-                ref_eltwise<uint8_t, uint8_t>(0, 1);
-            else
-                THROW_IE_EXCEPTION << "If Eltwise node has more than 2 inputs, only FP32, I32, I8, U8 are supported";
-            return;
        }
+        return false;
+    };

-        Precision pi0 = getParentEdgeAt(0)->getDesc().getPrecision();
-        Precision pi1 = getParentEdgeAt(1)->getDesc().getPrecision();
-        Precision po = getChildEdgeAt(0)->getDesc().getPrecision();
+    if (!mayiuse(cpu::sse42))
+        return false;

-        IE_ASSERT(getParentEdges().size() > 1);
+    // FQ inputs with quantization parameters will be hidden inside the post_op object, so they will not increase the number of inputs
+    size_t addedInputEdgesNum = node->getType() != Quantize ? (node->getParentEdges().size() - 1) : 0;
+    if (getParentEdges().size() + addedInputEdgesNum > MAX_ELTWISE_INPUTS)
+        return false;
-        if (!fusedWith.empty()) {
-            jit_eltwise_fq();
-        } else {
-            // Input and output types for eltwise compare operations can be different
-            bool is_eltwise_compare_node = (op == EltwiseLayer::Equal || op == EltwiseLayer::Not_equal ||
-                                            op == EltwiseLayer::Greater || op == EltwiseLayer::Greater_equal ||
-                                            op == EltwiseLayer::Less || op == EltwiseLayer::Less_equal);
-
-            if (po == Precision::FP32 && pi0 == po && pi1 == po) {
-                ref_eltwise<float, float>(0, 1);
-            } else if (po == Precision::FP32 && pi0 == po && pi1 == Precision::I8) {
-                ref_eltwise<float, int8_t>(0, 1);
-            } else if (po == Precision::FP32 && pi1 == po && pi0 == Precision::I8) {
-                ref_eltwise<float, int8_t>(1, 0);
-            } else if (po == Precision::FP32 && pi0 == po && pi1 == Precision::U8) {
-                ref_eltwise<float, uint8_t>(0, 1);
-            } else if (po == Precision::FP32 && pi1 == po && pi0 == Precision::U8) {
-                ref_eltwise<float, uint8_t>(1, 0);
-            } else if (po == Precision::I8 && pi0 == po && pi1 == po) {
-                ref_eltwise<int8_t, int8_t>(0, 1);
-            } else if (po == Precision::I8 && pi0 == po && pi1 == Precision::U8) {
-                ref_eltwise<int8_t, uint8_t>(0, 1);
-            } else if (po == Precision::I8 && pi1 == po && pi0 == Precision::U8) {
-                ref_eltwise<int8_t, uint8_t>(1, 0);
-            } else if (po == Precision::I32 && pi0 == po && pi1 == po) {
-                ref_eltwise<int32_t, int32_t>(0, 1);
-            } else if (po == Precision::U8 && pi0 == Precision::I32 && pi0 == pi1 && is_eltwise_compare_node) {
-                ref_eltwise2<int32_t, int32_t, uint8_t>(0, 1);
-            } else if (po == Precision::U8 && pi0 == Precision::FP32 && pi0 == pi1 && is_eltwise_compare_node) {
-                ref_eltwise2<float, float, uint8_t>(0, 1);
-            } else {
-                THROW_IE_EXCEPTION << "Eltwise node with unsupported combination of input and output types";
+    if (node->getType() == Eltwise) {
+        auto eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
+        if (eltwiseNode->getParentEdgesAtPort(0)[0]->getParent().get() != this) {
+            // Eltwise jitter doesn't respect the commutative property, so fusing is disabled in case it is applied not to the 0-th port.
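+            // For example, fusing a Subtract node whose first input comes from another node would make
+            // the generated kernel consume the operands in the wrong order (effectively computing
+            // src1 - src0 instead of the intended src0 - src1), so only commutative ops are fused here.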
+ if (isOneOf(eltwiseNode->getOpType(), {Subtract, Divide, FloorMod, Mod, PowerDynamic, Greater, GreaterEqual, Less, LessEqual})) { + return false; } - } - } -} - -bool MKLDNNEltwiseNode::created() const { - return getType() == Eltwise; -} -bool MKLDNNEltwiseNode::canBeInPlace() const { - size_t inPlaceWithParent = getParentEdges().size(); - for (size_t i = 0; i < inPlaceWithParent; i++) { - auto parentEdge = getParentEdgeAt(i); - if (!parentEdge->getParent()->isConstant() && - parentEdge->getParent()->getChildEdges().size() == 1) { - inPlaceWithParent = i; - break; - } - } - // This is WA for MKLDNN implementation - if (inPlaceWithParent != 0) - return false; - MKLDNNDims dims = getParentEdgeAt(0)->getDims(); - for (size_t cIdx = 0; cIdx < getChildEdges().size(); cIdx++) { - if (getChildEdgeAt(cIdx)->getDims() != dims) { - return false; + // Limitation: inputs precision definition inside Eltwise node assumes fusing is applied for 0-th port, + // otherwise we need identical precision on all inputs of fused node + for (int i = 1; i < eltwiseNode->getCnnLayer()->insData.size(); i++) { + if (eltwiseNode->getCnnLayer()->insData[0].lock()->getPrecision() != eltwiseNode->getCnnLayer()->insData[i].lock()->getPrecision()) { + return false; + } + } } + + return true; } - // Broadcast mode is complex for inplace usage - // So will disable it - if (broadcast) return false; + if (node->getType() == Quantize) { + auto *quantizeNode = dynamic_cast(node.get()); + if (quantizeNode == nullptr) + THROW_IE_EXCEPTION << "Cannot get quantize layer " << node->getName(); + return !quantizeNode->isBinarization(); + } - return true; + return false; } + REG_MKLDNN_PRIM_FOR(MKLDNNEltwiseNode, Eltwise); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h index 15b13c1..9b003ca 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h @@ -8,45 +8,98 @@ #include #include #include -#include #include +#include namespace MKLDNNPlugin { -struct jit_eltwise_fq_params { - int src0_step; - int src1_step; - int dst_step; - mkldnn::memory::data_type src0_dt; - mkldnn::memory::data_type src1_dt; - mkldnn::memory::data_type dst_dt; - int src0_data_size; - int src1_data_size; - int dst_data_size; - - InferenceEngine::EltwiseLayer::eOperation eltwise_op; +#define MAX_ELTWISE_INPUTS 7 + +enum EltwiseOpType { + Add = 0, + Multiply, + Subtract, + Divide, + FloorMod, + Mod, + Maximum, + Minimum, + SquaredDifference, + PowerDynamic, + PowerStatic, + MulAdd, + + Equal, + NotEqual, + Greater, + GreaterEqual, + Less, + LessEqual, + + LogicalAnd, + LogicalOr, + LogicalXor, + LogicalNot, + + Relu, + Gelu, + Elu, + Tanh, + Logistic, + Square, + Abs, + Sqrt, + Linear, + BoundedRelu, + SoftRelu, + Relu6, + Exp, + Clamp, + Swish, + Prelu, + Mish, + Hswish, + Hsigmoid }; -struct jit_eltwise_fq_call_args { - const void *src0; - const void *src1; +struct jit_eltwise_params { + size_t inputs_number; + size_t input_size; + + InferenceEngine::Precision src_prc[MAX_ELTWISE_INPUTS]; + InferenceEngine::Precision dst_prc; + + std::vector src_offsets[MAX_ELTWISE_INPUTS]; + std::vector dst_offsets; + + size_t src_size[MAX_ELTWISE_INPUTS]; + size_t dst_size; + size_t oc_size; +}; + +struct jit_eltwise_call_args { + const void *src_ptr[MAX_ELTWISE_INPUTS]; void *dst; + size_t work_amount; + size_t oc_off; }; -struct jit_uni_eltwise_fq_kernel { - void (*ker_)(const 
-struct jit_uni_eltwise_fq_kernel {
-    void (*ker_)(const jit_eltwise_fq_call_args *);
+class MKLDNNEltwiseNode;
-    void operator()(const jit_eltwise_fq_call_args *args) {
+struct jit_uni_eltwise_kernel {
+    void (*ker_)(const jit_eltwise_call_args *);
+
+    void operator()(const jit_eltwise_call_args *args) {
         assert(ker_);
         ker_(args);
     }
-    explicit jit_uni_eltwise_fq_kernel(jit_eltwise_fq_params jep, const mkldnn_primitive_attr &attr) : ker_(nullptr), jep_(jep), attr_(attr) {}
-    virtual ~jit_uni_eltwise_fq_kernel() {}
+    explicit jit_uni_eltwise_kernel(jit_eltwise_params jep, MKLDNNEltwiseNode& node) : ker_(nullptr), jep_(jep), eltwiseNode(node) {}
+    virtual ~jit_uni_eltwise_kernel() {}
-    jit_eltwise_fq_params jep_;
-    const mkldnn_primitive_attr &attr_;
+    jit_eltwise_params jep_;
+    MKLDNNEltwiseNode& eltwiseNode;
 };
 
 class MKLDNNEltwiseNode : public MKLDNNNode {
@@ -56,54 +109,66 @@ public:
     void getSupportedDescriptors() override;
     void initSupportedPrimitiveDescriptors() override;
+    void selectOptimalPrimitiveDescriptor() override;
     void createPrimitive() override;
     void execute(mkldnn::stream strm) override;
     bool created() const override;
     bool canBeInPlace() const override;
     bool isSum();
-    bool isUnitScales();
     bool isWithBroadcast();
-    void initOptimalPrimitiveDescriptor() override;
+
+    bool canFuse(const MKLDNNNodePtr& node) const;
+
+    size_t getOpInputsNum() const;
+    EltwiseOpType getOpType() const { return eltwiseOp; }
+    mkldnn::algorithm getAlgorithm() const { return eltwiseAlgorithm; }
+
+    float getAlpha() const { return alpha; }
+    float getBeta() const { return beta; }
+
+    void appendPostOps(mkldnn::post_ops& ops) override;
 
 private:
-    InferenceEngine::EltwiseLayer::eOperation op;
-    std::vector<float> sum_scales;
-    bool broadcast = false;
-    int batch_dim = 5;
-    mkldnn::primitive_attr attr;
-
-    std::shared_ptr<jit_uni_eltwise_fq_kernel> eltiwse_fq_kernel;
-    jit_eltwise_fq_params jep;
-
-    void jit_eltwise_fq();
-    void setPostOps(mkldnn::primitive_attr &attr, bool initWeights);
-
-    template <typename T0, typename T1> void ref_eltwise(int in0, int in1);
-    template <typename T0, typename T1> void ref_eltwise2(int in0, int in1);
-    void dims_calc(int *dims, const MKLDNNDims &edge_dims, bool channels_first);
-    void offset_out_calc(int *offset, int *dims);
-    void offset_in_calc(int *offset, int *dims_in, int *dims_out);
-
-    template <typename T0, typename T1> void eltwise_add(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1> void eltwise_prod(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1> void eltwise_max(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1> void eltwise_sub(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1> void eltwise_min(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1> void eltwise_div(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1> void eltwise_squared_diff(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1> void eltwise_floor_mod(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1> void eltwise_pow(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1> void eltwise_logical_and(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1> void eltwise_logical_or(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1> void eltwise_logical_xor(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
-
-    template <typename T0, typename T1, typename T2> void eltwise_equal(const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1, typename T2> void eltwise_not_equal(const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1, typename T2> void eltwise_less(const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1, typename T2> void eltwise_less_equal(const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1, typename T2> void eltwise_greater(const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, size_t dst_data_size);
-    template <typename T0, typename T1, typename T2> void eltwise_greater_equal(const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, size_t dst_data_size);
+    void init() override;
+
+    EltwiseOpType eltwiseOp = Add;
+    mkldnn::algorithm eltwiseAlgorithm = mkldnn::algorithm_undef;
+
+    std::shared_ptr<jit_uni_eltwise_kernel> eltwise_kernel = nullptr;
+    jit_eltwise_params jep = {};
+
+    int optimalTensorRank = 6;
+    bool canUseOptimizedImpl = false;
+    bool isDynBatchEnabled = false;
+    size_t batchDimIdx = 0;
+    size_t tensorRank = 0;
+    size_t fullWorkAmount = 0;
+    size_t schedulerWorkAmount = 0;
+    std::vector<std::vector<size_t>> dims_in = {};
+    std::vector<std::vector<size_t>> offsets_in = {};
+    std::vector<size_t> dims_out = {};
+    std::vector<size_t> offsets_out = {};
+    std::vector<ptrdiff_t> start_offset_in = {};
+    ptrdiff_t start_offset_out = 0;
+    std::vector<ptrdiff_t> offsets_oc = {};
+
+    float alpha = 0;
+    float beta = 0;
+    float gamma = 0;
+
+    std::vector<float> scales = {};
+    std::vector<float> shifts = {};
+
+    inline void executeOptimized6D(const std::vector<const uint8_t *>& src_ptrs, uint8_t *dst_ptr);
+    inline void executeOptimizedGeneric(const std::vector<const uint8_t *>& src_ptrs, uint8_t *dst_ptr);
+    inline void executeReference(const std::vector<const uint8_t *>& src_ptrs, uint8_t *dst_ptr);
+
+    void offset_out_calc(std::vector<size_t>& offset, std::vector<size_t>& dims);
+    void offset_in_calc(std::vector<size_t>& offset, std::vector<size_t>& dims_in, std::vector<size_t>& dims_out);
+
+    static InferenceEngine::details::caseless_map<std::string,
+            std::function<void(GenericLayer*, EltwiseOpType&, mkldnn::algorithm&, float&, float&)>> initializers;
 };
 
 }  // namespace MKLDNNPlugin
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.cpp
index 8d0b13d..bcb97ca 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.cpp
@@ -3,8 +3,7 @@
 //
 
 #include "mkldnn_fullyconnected_node.h"
-#include "mkldnn_activation_node.h"
-#include "mkldnn_depthwise_node.h"
+#include "mkldnn_eltwise_node.h"
 #include "mkldnn_quantize_node.h"
 #include "desc_iterator.hpp"
 #include
@@ -199,10 +198,10 @@ void MKLDNNFullyConnectedNode::setPostOps(mkldnn::primitive_attr &attr, bool ini
             continue;
         }
 
-        auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode *>(node.get());
-        if (depthwiseNode) {
+        auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
+        if (eltwiseNode && (eltwiseNode->getOpType() == MulAdd || eltwiseNode->getOpType() == Prelu)) {
             if (initWeights) {
-                auto* depthwiseLayer = reinterpret_cast<WeightableLayer*>(depthwiseNode->getCnnLayer().get());
+                auto* depthwiseLayer = reinterpret_cast<WeightableLayer*>(eltwiseNode->getCnnLayer().get());
                 int ndims = getParentEdgeAt(0)->getDims().ndims();
                 MKLDNNDims depthwiseDims({static_cast<size_t>(rnd_up(ndims == 3 ?
getChildEdgeAt(0)->getDims()[2] : getChildEdgeAt(0)->getDims()[1], 16))}); @@ -211,7 +210,7 @@ void MKLDNNFullyConnectedNode::setPostOps(mkldnn::primitive_attr &attr, bool ini PostOpsIntBlobMemory[blob_idx]->FillZero(); // In case ndims == 3 graph optimizer allows fusing only if all weights values are the same - if (depthwiseNode->isBroadcast() || ndims == 3) { + if (depthwiseLayer->blobs["weights"]->size() == 1 || ndims == 3) { float broadcastValue = static_cast(depthwiseLayer->_weights->buffer())[0]; for (int i = 0; i < PostOpsIntBlobMemory[blob_idx]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) { static_cast(PostOpsIntBlobMemory[blob_idx]->GetData())[i] = broadcastValue; @@ -223,13 +222,13 @@ void MKLDNNFullyConnectedNode::setPostOps(mkldnn::primitive_attr &attr, bool ini MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32)); } - if (depthwiseNode->getAlgorithm() == depthwise_scale_shift) { + if (eltwiseNode->getAlgorithm() == depthwise_scale_shift) { PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine()))); PostOpsIntBlobMemory[blob_idx + 1]->Create(depthwiseDims, memory::data_type::f32, memory::format::x); PostOpsIntBlobMemory[blob_idx + 1]->FillZero(); // In case ndims == 3 graph optimizer allows fusing only if all biases values are the same - if (depthwiseNode->isBroadcast() || ndims == 3) { + if (depthwiseLayer->blobs["biases"]->size() == 1 || ndims == 3) { float broadcastValue = static_cast(depthwiseLayer->_biases->buffer())[0]; for (int i = 0; i < PostOpsIntBlobMemory[blob_idx + 1]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) { static_cast(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[i] = broadcastValue; @@ -241,20 +240,20 @@ void MKLDNNFullyConnectedNode::setPostOps(mkldnn::primitive_attr &attr, bool ini MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32)); } - ops.append_depthwise(depthwiseNode->getAlgorithm(), + ops.append_depthwise(eltwiseNode->getAlgorithm(), (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(), (const float *) PostOpsIntBlobMemory[blob_idx + 1]->GetData()); blob_idx += 2; } else { - ops.append_depthwise(depthwiseNode->getAlgorithm(), + ops.append_depthwise(eltwiseNode->getAlgorithm(), (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(), nullptr); blob_idx += 1; } } else { - ops.append_depthwise(depthwiseNode->getAlgorithm(), + ops.append_depthwise(eltwiseNode->getAlgorithm(), nullptr, nullptr); } @@ -262,11 +261,8 @@ void MKLDNNFullyConnectedNode::setPostOps(mkldnn::primitive_attr &attr, bool ini continue; } - auto* activationNode = dynamic_cast(node.get()); - if (activationNode) { - ops.append_eltwise(1.0, activationNode->getAlgorithm(), activationNode->getAlpha(), activationNode->getBeta()); - - continue; + if (eltwiseNode) { + eltwiseNode->appendPostOps(ops); } } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_interpolate_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_interpolate_node.cpp index 5518956..de76d7d 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_interpolate_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_interpolate_node.cpp @@ -5,9 +5,8 @@ #include "mkldnn_interpolate_node.h" #include "desc_iterator.hpp" #include "mkldnn_quantize_node.h" -#include "mkldnn_depthwise_node.h" -#include "mkldnn_activation_node.h" #include +#include "mkldnn_eltwise_node.h" #include #include #include @@ -1480,62 +1479,9 @@ void MKLDNNInterpolateNode::setPostOps(mkldnn::primitive_attr &attr, bool initWe continue; } - auto* depthwiseNode = 
dynamic_cast<MKLDNNDepthwiseNode *>(node.get());
-        if (depthwiseNode) {
-            if (initWeights) {
-                auto* depthwiseLayer = reinterpret_cast<WeightableLayer*>(depthwiseNode->getCnnLayer().get());
-                MKLDNNDims depthwiseDims({static_cast<size_t>(rnd_up(getChildEdgeAt(0)->getDims()[1], 16))});
-
-                PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
-                PostOpsIntBlobMemory[blob_idx]->Create(depthwiseDims, memory::data_type::f32, memory::format::x);
-
-                PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::x,
-                                                        depthwiseLayer->_weights->buffer(),
-                                                        depthwiseLayer->_weights->size() *
-                                                        MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
-
-                if (depthwiseNode->isBroadcast()) {
-                    float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[0];
-                    for (int i = 1; i < PostOpsIntBlobMemory[blob_idx]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
-                        static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[i] = broadcastValue;
-                    }
-                }
-
-                if (depthwiseNode->getAlgorithm() == depthwise_scale_shift) {
-                    PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
-                    PostOpsIntBlobMemory[blob_idx + 1]->Create(depthwiseDims, memory::data_type::f32,
-                                                               memory::format::x);
-                    PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::x,
-                                                                depthwiseLayer->_biases->buffer(),
-                                                                depthwiseLayer->_biases->size() *
-                                                                MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
-
-                    if (depthwiseNode->isBroadcast()) {
-                        float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[0];
-                        for (int i = 1; i < PostOpsIntBlobMemory[blob_idx + 1]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
-                            static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[i] = broadcastValue;
-                        }
-                    }
-
-                    ops.append_depthwise(depthwiseNode->getAlgorithm(),
-                                         (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(),
-                                         (const float *) PostOpsIntBlobMemory[blob_idx + 1]->GetData());
-
-                    blob_idx += 2;
-                }
-            } else {
-                ops.append_depthwise(depthwiseNode->getAlgorithm(),
-                                     nullptr,
-                                     nullptr);
-            }
-
-            continue;
-        }
-
-        auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get());
-        if (activationNode) {
-            ops.append_eltwise(1.0, activationNode->getAlgorithm(), activationNode->getAlpha(), activationNode->getBeta());
-
+        auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
+        if (eltwiseNode) {
+            eltwiseNode->appendPostOps(ops);
             continue;
         }
@@ -2153,7 +2099,7 @@ inline int MKLDNNInterpolateNode::nearestRound(float originCoord, bool isDownsam
 }
 
 bool MKLDNNInterpolateNode::canFuse(const MKLDNNNodePtr& node) const {
-    auto isOneOf = [](mkldnn::algorithm alg, std::vector<mkldnn::algorithm> algs) {
+    auto isOneOf = [&](EltwiseOpType alg, std::vector<EltwiseOpType> algs) {
         for (auto a : algs) {
             if (alg == a) {
                 return true;
             }
         }
@@ -2170,22 +2116,16 @@ bool MKLDNNInterpolateNode::canFuse(const MKLDNNNodePtr& node) const {
     if (node->getType() == Quantize) {
         auto* quantizeNode = dynamic_cast<MKLDNNQuantizeNode *>(node.get());
         if (quantizeNode == nullptr)
-            THROW_IE_EXCEPTION << "Cannot get quantize layer " << node->getName();
+            THROW_IE_EXCEPTION << "Cannot get quantize node " << node->getName();
         return !quantizeNode->isBinarization();
-    } else if (node->getType() == Depthwise) {
-        auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode *>(node.get());
-        if (depthwiseNode == nullptr)
-            THROW_IE_EXCEPTION << "Cannot get depthwise layer " << node->getName();
-        return ((depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_scale_shift && depthwiseNode->isWithBiases()) ||
-                (depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_prelu));
-    } else if (node->getType() == 
Activation) { - auto* activationNode = dynamic_cast(node.get()); - if (activationNode == nullptr) - THROW_IE_EXCEPTION << "Cannot get activation layer " << node->getName(); - return isOneOf(activationNode->getAlgorithm(), {eltwise_relu, eltwise_gelu, eltwise_elu, eltwise_logistic, - eltwise_bounded_relu, eltwise_clamp, eltwise_tanh, eltwise_swish, eltwise_hswish, eltwise_mish, eltwise_hsigmoid, - eltwise_linear, eltwise_abs, eltwise_square, eltwise_sqrt}); + } else if (node->getType() == Eltwise) { + auto* eltwiseNode = dynamic_cast(node.get()); + if (eltwiseNode == nullptr) + THROW_IE_EXCEPTION << "Cannot get eltwise node " << node->getName(); + return isOneOf(eltwiseNode->getOpType(), {MulAdd, Prelu, Relu, Gelu, Elu, Logistic, BoundedRelu, Clamp, + Tanh, Swish, Hswish, Mish, Hsigmoid, Linear, Abs, Square, Sqrt}); } + return false; } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp index a5199a8..625a5b2 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp @@ -5,9 +5,8 @@ #include "mkldnn_mvn_node.h" #include "desc_iterator.hpp" #include "mkldnn_quantize_node.h" -#include "mkldnn_depthwise_node.h" -#include "mkldnn_activation_node.h" #include +#include "mkldnn_eltwise_node.h" #include #include #include @@ -597,64 +596,9 @@ void MKLDNNMVNNode::setPostOps(mkldnn::primitive_attr &attr, bool initWeights) { continue; } - auto* depthwiseNode = dynamic_cast(node.get()); - if (depthwiseNode) { - if (initWeights) { - auto* depthwiseLayer = reinterpret_cast(depthwiseNode->getCnnLayer().get()); - MKLDNNDims depthwiseDims({static_cast(rnd_up(getChildEdgeAt(0)->getDims()[1], 16))}); - - PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine()))); - PostOpsIntBlobMemory[blob_idx]->Create(depthwiseDims, memory::data_type::f32, memory::format::x); - PostOpsIntBlobMemory[blob_idx]->FillZero(); - - PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::x, - depthwiseLayer->_weights->buffer(), - depthwiseLayer->_weights->size() * - MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32)); - - if (depthwiseNode->isBroadcast()) { - float broadcastValue = static_cast(PostOpsIntBlobMemory[blob_idx]->GetData())[0]; - for (int i = 1; i < PostOpsIntBlobMemory[blob_idx]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) { - static_cast(PostOpsIntBlobMemory[blob_idx]->GetData())[i] = broadcastValue; - } - } - - if (depthwiseNode->getAlgorithm() == depthwise_scale_shift) { - PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine()))); - PostOpsIntBlobMemory[blob_idx + 1]->Create(depthwiseDims, memory::data_type::f32, - memory::format::x); - PostOpsIntBlobMemory[blob_idx + 1]->FillZero(); - PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::x, - depthwiseLayer->_biases->buffer(), - depthwiseLayer->_biases->size() * - MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32)); - - if (depthwiseNode->isBroadcast()) { - float broadcastValue = static_cast(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[0]; - for (int i = 1; i < PostOpsIntBlobMemory[blob_idx + 1]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) { - static_cast(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[i] = broadcastValue; - } - } - - ops.append_depthwise(depthwiseNode->getAlgorithm(), - (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(), - (const float *) 
PostOpsIntBlobMemory[blob_idx + 1]->GetData()); - - blob_idx += 2; - } - } else { - ops.append_depthwise(depthwiseNode->getAlgorithm(), - nullptr, - nullptr); - } - - continue; - } - - auto* activationNode = dynamic_cast(node.get()); - if (activationNode) { - ops.append_eltwise(1.0, activationNode->getAlgorithm(), activationNode->getAlpha(), activationNode->getBeta()); - + auto* eltwiseNode = dynamic_cast(node.get()); + if (eltwiseNode) { + eltwiseNode->appendPostOps(ops); continue; } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.cpp index fd59bb9..72f7570 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.cpp @@ -3,8 +3,7 @@ // #include "mkldnn_quantize_node.h" -#include "mkldnn_depthwise_node.h" -#include "mkldnn_activation_node.h" +#include "mkldnn_eltwise_node.h" #include #include #include "ie_parallel.hpp" @@ -808,70 +807,9 @@ void MKLDNNNormalizeNode::setPostOps(mkldnn::primitive_attr &attr, bool initWeig continue; } - auto* depthwiseNode = dynamic_cast(node.get()); - if (depthwiseNode) { - if (initWeights) { - auto* depthwiseLayer = reinterpret_cast(depthwiseNode->getCnnLayer().get()); - MKLDNNDims depthwiseDims({static_cast(rnd_up(getParentEdgeAt(0)->getDims()[1], 16))}); - - PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine()))); - PostOpsIntBlobMemory[blob_idx]->Create(depthwiseDims, memory::data_type::f32, memory::format::x); - PostOpsIntBlobMemory[blob_idx]->FillZero(); - - PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::x, - depthwiseLayer->_weights->buffer(), - depthwiseLayer->_weights->size() * - MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32)); - - if (depthwiseNode->isBroadcast()) { - float broadcastValue = static_cast(PostOpsIntBlobMemory[blob_idx]->GetData())[0]; - for (int i = 1; i < PostOpsIntBlobMemory[blob_idx]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) { - static_cast(PostOpsIntBlobMemory[blob_idx]->GetData())[i] = broadcastValue; - } - } - - if (depthwiseNode->getAlgorithm() == depthwise_scale_shift) { - PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine()))); - PostOpsIntBlobMemory[blob_idx + 1]->Create(depthwiseDims, memory::data_type::f32, - memory::format::x); - PostOpsIntBlobMemory[blob_idx + 1]->FillZero(); - PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::x, - depthwiseLayer->_biases->buffer(), - depthwiseLayer->_biases->size() * - MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32)); - - if (depthwiseNode->isBroadcast()) { - float broadcastValue = static_cast(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[0]; - for (int i = 1; i < PostOpsIntBlobMemory[blob_idx + 1]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) { - static_cast(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[i] = broadcastValue; - } - } - - ops.append_depthwise(depthwiseNode->getAlgorithm(), - (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(), - (const float *) PostOpsIntBlobMemory[blob_idx + 1]->GetData()); - - blob_idx += 2; - } else { - ops.append_depthwise(depthwiseNode->getAlgorithm(), - (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(), - nullptr); - - blob_idx += 1; - } - } else { - ops.append_depthwise(depthwiseNode->getAlgorithm(), - nullptr, - nullptr); - } - - continue; - } - - auto* activationNode = dynamic_cast(node.get()); - if 
(activationNode) { - ops.append_eltwise(1.0, activationNode->getAlgorithm(), activationNode->getAlpha(), activationNode->getBeta()); - + auto* eltwiseNode = dynamic_cast(node.get()); + if (eltwiseNode) { + eltwiseNode->appendPostOps(ops); continue; } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.cpp deleted file mode 100644 index c2885b6..0000000 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.cpp +++ /dev/null @@ -1,133 +0,0 @@ -// Copyright (C) 2018-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "mkldnn_power_node.h" -#include -#include -#include -#include -#include -#include -#include "ie_parallel.hpp" - -using namespace mkldnn; -using namespace MKLDNNPlugin; -using namespace InferenceEngine; - -MKLDNNPowerNode::MKLDNNPowerNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache) - : MKLDNNNode(layer, eng, cache), scale(1.0f), shift(1.0f), power(1.0f) {} - -void MKLDNNPowerNode::getSupportedDescriptors() { - auto * powerLayer = dynamic_cast(getCnnLayer().get()); - - if (powerLayer == nullptr) - THROW_IE_EXCEPTION << "Cannot convert power layer."; - scale = powerLayer->scale; - power = powerLayer->power; - shift = powerLayer->offset; - - if (getParentEdges().size() != 1) - THROW_IE_EXCEPTION << "Incorrect number of input edges for layer " << getName(); - if (getChildEdges().empty()) - THROW_IE_EXCEPTION << "Incorrect number of output edges for layer " << getName(); -} - -void MKLDNNPowerNode::initSupportedPrimitiveDescriptors() { - if (!supportedPrimitiveDescriptors.empty()) - return; - - InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision(); - if (precision != InferenceEngine::Precision::FP32) - precision = InferenceEngine::Precision::FP32; - auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision); - precision = getCnnLayer()->outData[0]->getPrecision(); - if (precision != InferenceEngine::Precision::FP32) - precision = InferenceEngine::Precision::FP32; - auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision); - - InferenceEngine::LayerConfig config; - config.dynBatchSupport = true; - config.inConfs.resize(1); - config.outConfs.resize(1); - config.inConfs[0].inPlace = -1; - config.inConfs[0].constant = false; - config.outConfs[0].inPlace = -1; - config.outConfs[0].constant = false; - for (auto format : getAvailableFormatsForDims(getParentEdgeAt(0)->getDims())) { - config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, format); - config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, format); - if (format != memory::any) { - config.inConfs[0].desc = InferenceEngine::TensorDesc(config.inConfs[0].desc.getPrecision(), - config.inConfs[0].desc.getDims(), { - config.inConfs[0].desc.getBlockingDesc().getBlockDims(), - config.inConfs[0].desc.getBlockingDesc().getOrder(), - (std::numeric_limits::max)() - }); - config.outConfs[0].desc = InferenceEngine::TensorDesc(config.outConfs[0].desc.getPrecision(), - config.outConfs[0].desc.getDims(), { - config.outConfs[0].desc.getBlockingDesc().getBlockDims(), - config.outConfs[0].desc.getBlockingDesc().getOrder(), - (std::numeric_limits::max)() - }); - } - supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, format); - } -} - -void MKLDNNPowerNode::createPrimitive() { - auto& dstMemPtr = 
getChildEdgeAt(0)->getMemoryPtr(); - auto& srcMemPtr = getParentEdgeAt(0)->getMemoryPtr(); - if (!dstMemPtr || !dstMemPtr->GetPrimitivePtr()) - THROW_IE_EXCEPTION << "Destination memory didn't allocate."; - if (!srcMemPtr || !srcMemPtr->GetPrimitivePtr()) - THROW_IE_EXCEPTION << "Input memory didn't allocate."; - if (getSelectedPrimitiveDescriptor() == nullptr) - THROW_IE_EXCEPTION << "Preferable primitive descriptor is not set."; -} - -void MKLDNNPowerNode::execute(mkldnn::stream strm) { - auto& srcMemory = getParentEdgeAt(0)->getMemory(); - auto& dstMemory = getChildEdgeAt(0)->getMemory(); - const size_t data_size = srcMemory.GetSize() / sizeof(float) / srcMemory.GetDims()[0] * batchToProcess(); - - const auto *src_ptr = reinterpret_cast(srcMemory.GetData()) + - srcMemory.GetDescriptor().data.layout_desc.blocking.offset_padding; - float *dst_ptr = reinterpret_cast(dstMemory.GetData()) + - dstMemory.GetDescriptor().data.layout_desc.blocking.offset_padding; - - if (power == -1.f) { - parallel_for(data_size, [&](size_t i) { - float val = src_ptr[i] * scale + shift; - dst_ptr[i] = 1 / val; - }); - } else if (power == 0.5f) { - parallel_for(data_size, [&](size_t i) { - float val = src_ptr[i] * scale + shift; - dst_ptr[i] = sqrtf(val); - }); - } else if (power == 1.0f) { - parallel_for(data_size, [&](size_t i) { - dst_ptr[i] = src_ptr[i] * scale + shift; - }); - } else if (power == 2.0f) { - parallel_for(data_size, [&](size_t i) { - float val = src_ptr[i] * scale + shift; - dst_ptr[i] = val * val; - }); - } else if (power == 3.0f) { - parallel_for(data_size, [&](size_t i) { - float val = src_ptr[i] * scale + shift; - dst_ptr[i] = val * val * val; - }); - } else { - parallel_for(data_size, [&](size_t i) { - dst_ptr[i] = pow(src_ptr[i] * scale + shift, power); - }); - } -} - -bool MKLDNNPowerNode::created() const { - return getType() == Power; -} -REG_MKLDNN_PRIM_FOR(MKLDNNPowerNode, Power); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.h deleted file mode 100644 index 71103b9..0000000 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.h +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (C) 2018-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include -#include -#include - -namespace MKLDNNPlugin { - -class MKLDNNPowerNode : public MKLDNNNode { -public: - MKLDNNPowerNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); - ~MKLDNNPowerNode() override = default; - - void getSupportedDescriptors() override; - void initSupportedPrimitiveDescriptors() override; - void createPrimitive() override; - void execute(mkldnn::stream strm) override; - bool created() const override; - -private: - float scale; - float shift; - float power; -}; - -} // namespace MKLDNNPlugin - diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.cpp index c313107..5331dc2 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.cpp @@ -43,10 +43,6 @@ void MKLDNNQuantizeNode::init() { THROW_IE_EXCEPTION << "Quantize layer " << getName() << " has unsupported number of parent edges at port " << i; } - if (getParentEdgesAtPort(0)[0]->getDims().ndims() < 1ul || getParentEdgesAtPort(0)[0]->getDims().ndims() > 5ul) { - THROW_IE_EXCEPTION << "Unsupported number of 
dimensions for input at edge 0 in Quantize layer " << getName(); - } - auto initAxisIdx = [&](size_t edgeIdx) { auto edge = getParentEdgesAtPort(edgeIdx)[0]; @@ -319,6 +315,10 @@ std::vector MKLDNNQuantizeNode::getDataFormats() const { } void MKLDNNQuantizeNode::getSupportedDescriptors() { + if (getParentEdgesAtPort(0)[0]->getDims().ndims() < 1ul || getParentEdgesAtPort(0)[0]->getDims().ndims() > 5ul) { + THROW_IE_EXCEPTION << "Unsupported number of dimensions for input at edge 0 in Quantize layer " << getName(); + } + mkldnn::memory::data_type idt = MKLDNNExtensionUtils::IEPrecisionToDataType(getInputPrecision()); mkldnn::memory::data_type wdt = MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::Precision::FP32); mkldnn::memory::data_type ddt = MKLDNNExtensionUtils::IEPrecisionToDataType(getOutputPrecision()); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reduce_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reduce_node.cpp index c2f941a..afe532d 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reduce_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reduce_node.cpp @@ -5,12 +5,11 @@ #include "mkldnn_reduce_node.h" #include "desc_iterator.hpp" #include "mkldnn_quantize_node.h" -#include "mkldnn_depthwise_node.h" -#include "mkldnn_activation_node.h" #include #include #include #include +#include #include #include #include "ie_parallel.hpp" diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_resample_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_resample_node.cpp index b63fc70..035b452 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_resample_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_resample_node.cpp @@ -5,9 +5,8 @@ #include "mkldnn_resample_node.h" #include "desc_iterator.hpp" #include "mkldnn_quantize_node.h" -#include "mkldnn_depthwise_node.h" -#include "mkldnn_activation_node.h" #include +#include "mkldnn_eltwise_node.h" #include #include #include @@ -438,64 +437,9 @@ void MKLDNNResampleNode::setPostOps(mkldnn::primitive_attr &attr, bool initWeigh continue; } - auto* depthwiseNode = dynamic_cast(node.get()); - if (depthwiseNode) { - if (initWeights) { - auto* depthwiseLayer = reinterpret_cast(depthwiseNode->getCnnLayer().get()); - MKLDNNDims depthwiseDims({static_cast(rnd_up(getChildEdgeAt(0)->getDims()[1], 16))}); - - PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine()))); - PostOpsIntBlobMemory[blob_idx]->Create(depthwiseDims, memory::data_type::f32, memory::format::x); - PostOpsIntBlobMemory[blob_idx]->FillZero(); - - PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::x, - depthwiseLayer->_weights->buffer(), - depthwiseLayer->_weights->size() * - MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32)); - - if (depthwiseNode->isBroadcast()) { - float broadcastValue = static_cast(PostOpsIntBlobMemory[blob_idx]->GetData())[0]; - for (int i = 1; i < PostOpsIntBlobMemory[blob_idx]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) { - static_cast(PostOpsIntBlobMemory[blob_idx]->GetData())[i] = broadcastValue; - } - } - - if (depthwiseNode->getAlgorithm() == depthwise_scale_shift) { - PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine()))); - PostOpsIntBlobMemory[blob_idx + 1]->Create(depthwiseDims, memory::data_type::f32, - memory::format::x); - PostOpsIntBlobMemory[blob_idx + 1]->FillZero(); - PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::x, - 
depthwiseLayer->_biases->buffer(), - depthwiseLayer->_biases->size() * - MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32)); - - if (depthwiseNode->isBroadcast()) { - float broadcastValue = static_cast(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[0]; - for (int i = 1; i < PostOpsIntBlobMemory[blob_idx + 1]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) { - static_cast(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[i] = broadcastValue; - } - } - - ops.append_depthwise(depthwiseNode->getAlgorithm(), - (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(), - (const float *) PostOpsIntBlobMemory[blob_idx + 1]->GetData()); - - blob_idx += 2; - } - } else { - ops.append_depthwise(depthwiseNode->getAlgorithm(), - nullptr, - nullptr); - } - - continue; - } - - auto* activationNode = dynamic_cast(node.get()); - if (activationNode) { - ops.append_eltwise(1.0, activationNode->getAlgorithm(), activationNode->getAlpha(), activationNode->getBeta()); - + auto* eltwiseNode = dynamic_cast(node.get()); + if (eltwiseNode) { + eltwiseNode->appendPostOps(ops); continue; } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_scatter_update_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_scatter_update_node.cpp index a740111..fe34f81 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_scatter_update_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_scatter_update_node.cpp @@ -5,8 +5,6 @@ #include "mkldnn_scatter_update_node.h" #include "desc_iterator.hpp" #include "mkldnn_quantize_node.h" -#include "mkldnn_depthwise_node.h" -#include "mkldnn_activation_node.h" #include #include #include diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/eltwise.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/eltwise.cpp index 185f9ba..9b20c89 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/eltwise.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/eltwise.cpp @@ -39,9 +39,14 @@ std::vector opTypes = { }; std::vector eltwiseOpTypes = { + ngraph::helpers::EltwiseTypes::ADD, ngraph::helpers::EltwiseTypes::MULTIPLY, ngraph::helpers::EltwiseTypes::SUBTRACT, - ngraph::helpers::EltwiseTypes::ADD + ngraph::helpers::EltwiseTypes::DIVIDE, + ngraph::helpers::EltwiseTypes::FLOOR_MOD, + ngraph::helpers::EltwiseTypes::SQUARED_DIFF, + ngraph::helpers::EltwiseTypes::POWER, + ngraph::helpers::EltwiseTypes::MOD }; std::map additional_config = {}; diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp index 53d314c..033b20a 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp @@ -22,7 +22,6 @@ std::vector disabledTestPatterns() { R"(.*(QuantGroupConv3D).*)", // TODO: Issue 31845 R"(.*(FakeQuantizeLayerTest).*)", - R"(.*(EltwiseLayerTest).*IS=\(.*\..*\..*\..*\..*\).*secondaryInputType=PARAMETER.*opType=SCALAR.*)", // TODO: failed to downgrade to opset v0 in interpreter backend R"(.*Gather.*axis=-1.*)", // TODO: Issue 33151 diff --git a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/eltwise.cpp b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/eltwise.cpp new file mode 100644 index 
0000000..7d371b4 --- /dev/null +++ b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/eltwise.cpp @@ -0,0 +1,327 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include "test_utils/cpu_test_utils.hpp" + +using namespace InferenceEngine; +using namespace CPUTestUtils; + +namespace CPULayerTestsDefinitions { + +typedef std::tuple< + LayerTestsDefinitions::EltwiseTestParams, + CPUSpecificParams> EltwiseLayerCPUTestParamsSet; + +class EltwiseLayerCPUTest : public testing::WithParamInterface, + virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase { +public: + static std::string getTestCaseName(testing::TestParamInfo obj) { + LayerTestsDefinitions::EltwiseTestParams basicParamsSet; + CPUSpecificParams cpuParams; + std::tie(basicParamsSet, cpuParams) = obj.param; + + std::ostringstream result; + result << LayerTestsDefinitions::EltwiseLayerTest::getTestCaseName(testing::TestParamInfo( + basicParamsSet, 0)); + result << CPUTestsBase::getTestCaseName(cpuParams); + + return result.str(); + } + +protected: + void SetUp() { + LayerTestsDefinitions::EltwiseTestParams basicParamsSet; + CPUSpecificParams cpuParams; + std::tie(basicParamsSet, cpuParams) = this->GetParam(); + + std::vector> inputShapes; + InferenceEngine::Precision netPrecision; + ngraph::helpers::InputLayerType secondaryInputType; + CommonTestUtils::OpType opType; + ngraph::helpers::EltwiseTypes eltwiseType; + std::map additional_config; + std::tie(inputShapes, eltwiseType, secondaryInputType, opType, netPrecision, inPrc, outPrc, inLayout, targetDevice, additional_config) = basicParamsSet; + std::tie(inFmts, outFmts, priority, selectedType) = cpuParams; + auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); + + std::string isaType; + if (with_cpu_x86_avx512f()) { + isaType = "jit_avx512"; + } else if (with_cpu_x86_avx2()) { + isaType = "jit_avx2"; + } else if (with_cpu_x86_sse42()) { + isaType = "jit_sse42"; + } else { + isaType = "ref"; + } + selectedType = isaType + "_" + "FP32"; + + std::vector inputShape1, inputShape2; + if (inputShapes.size() == 1) { + inputShape1 = inputShape2 = inputShapes.front(); + } else if (inputShapes.size() == 2) { + inputShape1 = inputShapes.front(); + inputShape2 = inputShapes.back(); + } else { + THROW_IE_EXCEPTION << "Incorrect number of input shapes"; + } + + configuration.insert(additional_config.begin(), additional_config.end()); + auto input = ngraph::builder::makeParams(ngPrc, {inputShape1}); + + std::vector shape_input_secondary; + switch (opType) { + case CommonTestUtils::OpType::SCALAR: { + shape_input_secondary = std::vector({1}); + break; + } + case CommonTestUtils::OpType::VECTOR: + shape_input_secondary = inputShape2; + break; + default: + FAIL() << "Unsupported Secondary operation type"; + } + + std::shared_ptr secondaryInput; + if (eltwiseType == ngraph::helpers::EltwiseTypes::DIVIDE || + eltwiseType == ngraph::helpers::EltwiseTypes::FLOOR_MOD || + eltwiseType == ngraph::helpers::EltwiseTypes::MOD) { + std::vector data(ngraph::shape_size(shape_input_secondary)); + data = NGraphFunctions::Utils::generateVector(ngraph::shape_size(shape_input_secondary)); + for (float &i : data) { + if (i == 0) { + i = 1; + } + } + secondaryInput = ngraph::builder::makeConstant(ngPrc, shape_input_secondary, data); + } else { + secondaryInput = ngraph::builder::makeInputLayer(ngPrc, secondaryInputType, shape_input_secondary); + if (secondaryInputType == 
ngraph::helpers::InputLayerType::PARAMETER) { + input.push_back(std::dynamic_pointer_cast(secondaryInput)); + } + } + + auto eltwise = ngraph::builder::makeEltwise(input[0], secondaryInput, eltwiseType); + eltwise->get_rt_info() = CPUTestsBase::setCPUInfo(inFmts, outFmts, priority); + function = std::make_shared(eltwise, input, "Eltwise"); + } +}; + +TEST_P(EltwiseLayerCPUTest, CompareWithRefs) { + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + Run(); + CheckCPUImpl(executableNetwork, "Eltwise", inFmts, outFmts, selectedType); +} + +namespace { + +std::vector secondaryInputTypes = { + ngraph::helpers::InputLayerType::CONSTANT, + ngraph::helpers::InputLayerType::PARAMETER, +}; + +std::vector opTypes = { + CommonTestUtils::OpType::VECTOR, +}; + +std::vector eltwiseOpTypes = { + ngraph::helpers::EltwiseTypes::ADD, + ngraph::helpers::EltwiseTypes::MULTIPLY, + // TODO: Disabled because memory formats filter is not propogated through ngraph transformations +// ngraph::helpers::EltwiseTypes::SUBTRACT, +// ngraph::helpers::EltwiseTypes::DIVIDE, + ngraph::helpers::EltwiseTypes::FLOOR_MOD, + ngraph::helpers::EltwiseTypes::SQUARED_DIFF, +}; + +std::map additional_config = {}; + +std::vector filterCPUSpecificParams(std::vector& paramsVector) { + auto adjustBlockedFormatByIsa = [](std::vector& formats) { + for (int i = 0; i < formats.size(); i++) { + if (formats[i] == nChw16c) + formats[i] = nChw8c; + if (formats[i] == nCdhw16c) + formats[i] = nCdhw8c; + } + }; + + if (!with_cpu_x86_avx512f()) { + for (auto& param : paramsVector) { + adjustBlockedFormatByIsa(std::get<0>(param)); + adjustBlockedFormatByIsa(std::get<1>(param)); + } + } + + return paramsVector; +} + +std::vector>> inShapes_4D = { + {{2, 4, 4, 1}}, + {{2, 17, 5, 4}}, + {{2, 17, 5, 4}, {1, 17, 1, 1}}, + {{2, 17, 5, 1}, {1, 17, 1, 4}}, +}; + +std::vector cpuParams_4D = { + CPUSpecificParams({nChw16c, nChw16c}, {nChw16c}, {}, {}), + CPUSpecificParams({nhwc, nhwc}, {nhwc}, {}, {}), + CPUSpecificParams({nchw, nchw}, {nchw}, {}, {}) +}; + +const auto params_4D_FP32 = ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(inShapes_4D), + ::testing::ValuesIn(eltwiseOpTypes), + ::testing::ValuesIn(secondaryInputTypes), + ::testing::ValuesIn(opTypes), + ::testing::Values(InferenceEngine::Precision::FP32), + ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Layout::ANY), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config)), + ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D))); + +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_4D_FP32, EltwiseLayerCPUTest, params_4D_FP32, EltwiseLayerCPUTest::getTestCaseName); + + +std::vector>> inShapes_5D = { + {{2, 4, 3, 4, 1}}, + {{2, 17, 7, 5, 4}}, + {{2, 17, 6, 5, 4}, {1, 17, 6, 1, 1}}, + {{2, 17, 6, 5, 1}, {1, 17, 1, 1, 4}}, +}; + +std::vector cpuParams_5D = { + CPUSpecificParams({nCdhw16c, nCdhw16c}, {nCdhw16c}, {}, {}), + CPUSpecificParams({ndhwc, ndhwc}, {ndhwc}, {}, {}), + CPUSpecificParams({ncdhw, ncdhw}, {ncdhw}, {}, {}) +}; + +const auto params_5D_FP32 = ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(inShapes_5D), + ::testing::ValuesIn(eltwiseOpTypes), + ::testing::ValuesIn(secondaryInputTypes), + ::testing::ValuesIn(opTypes), + ::testing::Values(InferenceEngine::Precision::FP32), + ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + 
::testing::Values(InferenceEngine::Layout::ANY), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config)), + ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D))); + +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_5D_FP32, EltwiseLayerCPUTest, params_5D_FP32, EltwiseLayerCPUTest::getTestCaseName); + + +std::vector>> inShapes_4D_Blocked_Planar = { + {{2, 17, 31, 3}, {2, 1, 31, 3}}, + {{2, 17, 5, 1}, {2, 1, 1, 4}}, +}; + +std::vector cpuParams_4D_Blocked_Planar = { + CPUSpecificParams({nChw16c, nchw}, {nChw16c}, {}, {}), +}; + +const auto params_4D_FP32_Blocked_Planar = ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(inShapes_4D_Blocked_Planar), + ::testing::ValuesIn(eltwiseOpTypes), + ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT), + ::testing::ValuesIn(opTypes), + ::testing::Values(InferenceEngine::Precision::FP32), + ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Layout::ANY), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config)), + ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D_Blocked_Planar))); + +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_4D_FP32_Blocked_Planar, EltwiseLayerCPUTest, params_4D_FP32_Blocked_Planar, EltwiseLayerCPUTest::getTestCaseName); + + +std::vector>> inShapes_4D_Planar_Blocked = { + {{2, 1, 31, 3}, {2, 17, 31, 3}}, + {{2, 1, 1, 4}, {2, 17, 5, 1}}, +}; + +std::vector cpuParams_4D_Planar_Blocked = { + CPUSpecificParams({nchw, nChw16c}, {nChw16c}, {}, {}), +}; + +const auto params_4D_FP32_Planar_Blocked = ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(inShapes_4D_Planar_Blocked), + ::testing::ValuesIn(eltwiseOpTypes), + ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT), + ::testing::ValuesIn(opTypes), + ::testing::Values(InferenceEngine::Precision::FP32), + ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Layout::ANY), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config)), + ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D_Planar_Blocked))); + +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_4D_FP32_Planar_Blocked, EltwiseLayerCPUTest, params_4D_FP32_Planar_Blocked, EltwiseLayerCPUTest::getTestCaseName); + + +std::vector>> inShapes_5D_Blocked_Planar = { + {{2, 17, 31, 4, 3}, {2, 1, 31, 1, 3}}, + {{2, 17, 5, 3, 1}, {2, 1, 1, 3, 4}}, +}; + +std::vector cpuParams_5D_Blocked_Planar = { + CPUSpecificParams({nCdhw16c, ncdhw}, {nCdhw16c}, {}, {}), +}; + +const auto params_5D_FP32_Blocked_Planar = ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(inShapes_5D_Blocked_Planar), + ::testing::ValuesIn(eltwiseOpTypes), + ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT), + ::testing::ValuesIn(opTypes), + ::testing::Values(InferenceEngine::Precision::FP32), + ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Layout::ANY), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config)), + ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D_Blocked_Planar))); + +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_5D_FP32_Blocked_Planar, EltwiseLayerCPUTest, params_5D_FP32_Blocked_Planar, EltwiseLayerCPUTest::getTestCaseName); + 
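// ---------------------------------------------------------------------------
// Editorial note, not part of the patch: what the Blocked_Planar / Planar_Blocked
// suites above exercise. In a blocked layout such as nChw16c the channel axis is
// split into blocks of 16, so pairing it with a planar (nchw/ncdhw) input makes
// the eltwise node reconcile two different element orders. A sketch of blocked
// indexing (illustrative helper, not plugin code):
#include <cstddef>
#include <cstdio>

// Offset of element (n, c, h, w) in an nChw16c buffer of logical shape N x C x H x W.
static size_t index_nChw16c(size_t n, size_t c, size_t h, size_t w,
                            size_t C, size_t H, size_t W) {
    const size_t block = 16;
    const size_t cBlocks = (C + block - 1) / block;  // channels padded to whole blocks
    return (((n * cBlocks + c / block) * H + h) * W + w) * block + c % block;
}

int main() {
    // For the {2, 17, 5, 4} shape used in inShapes_4D above, channel 16 starts
    // the second block, i.e. a completely different region of the buffer:
    std::printf("%zu\n", index_nChw16c(0, 16, 0, 0, 17, 5, 4));  // prints 320
    return 0;
}
// ---------------------------------------------------------------------------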
+ +std::vector>> inShapes_5D_Planar_Blocked = { + {{2, 1, 31, 1, 3}, {2, 17, 31, 4, 3}}, + {{2, 1, 1, 3, 4}, {2, 17, 5, 3, 1}}, +}; + +std::vector cpuParams_5D_Planar_Blocked = { + CPUSpecificParams({ncdhw, nCdhw16c}, {nCdhw16c}, {}, {}), +}; + +const auto params_5D_FP32_Planar_Blocked = ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(inShapes_5D_Planar_Blocked), + ::testing::ValuesIn(eltwiseOpTypes), + ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT), + ::testing::ValuesIn(opTypes), + ::testing::Values(InferenceEngine::Precision::FP32), + ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Layout::ANY), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config)), + ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D_Planar_Blocked))); + +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_5D_FP32_Planar_Blocked, EltwiseLayerCPUTest, params_5D_FP32_Planar_Blocked, EltwiseLayerCPUTest::getTestCaseName); + +} // namespace +} // namespace CPULayerTestsDefinitions diff --git a/inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/eltwise_chain.cpp b/inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/eltwise_chain.cpp new file mode 100644 index 0000000..fad9068 --- /dev/null +++ b/inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/eltwise_chain.cpp @@ -0,0 +1,184 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include +#include +#include +#include +#include +#include +#include +#include +#include "common_test_utils/common_utils.hpp" +#include "functional_test_utils/precision_utils.hpp" +#include "functional_test_utils/skip_tests_config.hpp" +#include "test_utils/cpu_test_utils.hpp" +#include "ie_system_conf.h" + +using namespace CPUTestUtils; +using InferenceEngine::Precision; +using ngraph::helpers::EltwiseTypes; +using FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc; + +namespace CPULayerTestsDefinitions { + +typedef std::tuple< + std::vector>, // Input shapes + std::vector, // Input precisions + std::vector, // Eltwise operations + bool, // With quantization + std::string // Device name +> EltwiseChainTuple; + +class EltwiseChainTest : public testing::WithParamInterface, + virtual public LayerTestsUtils::LayerTestsCommon { +public: + static std::string getTestCaseName(const testing::TestParamInfo &obj) { + std::vector> inputShapes; + std::vector inputPrecisions; + std::vector eltwiseOpTypes; + bool withQuantization; + std::string targetName; + std::tie(inputShapes, inputPrecisions, eltwiseOpTypes, withQuantization, targetName) = obj.param; + std::ostringstream results; + + for (int i = 0; i < inputShapes.size(); i++) { + results << "IS" << std::to_string(i) << "=" << CommonTestUtils::vec2str(inputShapes[i]) << "_"; + } + for (int i = 0; i < inputPrecisions.size(); i++) { + results << "InPRC" << std::to_string(i) << "=" << inputPrecisions[i].name() << "_"; + } + for (int i = 0; i < eltwiseOpTypes.size(); i++) { + results << "Op" << std::to_string(i) << "=" << eltwiseOpTypes[i] << "_"; + } + + results << "WithQuant=" << withQuantization << "_"; + results << "targetDevice=" << targetName; + + return results.str(); + } + +protected: + void SetUp() { + threshold = 0.1f; + + std::vector> inputShapes; + std::vector inputPrecisions; + std::vector eltwiseOpTypes; + bool withQuantization; + std::tie(inputShapes, inputPrecisions, eltwiseOpTypes, 
withQuantization, targetDevice) = this->GetParam(); + + auto ngraphParam = ngraph::builder::makeParams(convertIE2nGraphPrc(inputPrecisions[0]), {inputShapes[0]}); + + std::vector> ngraphInputs; + for (int i = 1; i < inputPrecisions.size(); i++) { + std::vector ngraphInput1Data(ngraph::shape_size(ngraph::Shape{inputShapes[i]})); + ngraphInputs.push_back(ngraph::builder::makeConstant(convertIE2nGraphPrc(inputPrecisions[i]), ngraph::Shape{inputShapes[i]}, + ngraphInput1Data, true)); + } + + if (withQuantization) { + std::vector> eltwiseOps; + eltwiseOps.push_back(ngraph::builder::makeEltwise(ngraphParam[0], ngraphInputs[0], eltwiseOpTypes[0])); + for (int i = 1; i < eltwiseOpTypes.size() - 1; i++) { + eltwiseOps.push_back(ngraph::builder::makeEltwise(eltwiseOps[eltwiseOps.size() - 1], ngraphInputs[i], eltwiseOpTypes[i])); + } + + std::vector constShape(inputShapes[0].size(), 1); + constShape[1] = inputShapes[0][1]; + auto fq = ngraph::builder::makeFakeQuantize(eltwiseOps[eltwiseOps.size() - 1], + ::ngraph::element::Type(::ngraph::element::Type_t::f32), + 256, constShape); + + eltwiseOps.push_back(ngraph::builder::makeEltwise(fq, ngraphInputs[eltwiseOpTypes.size() - 1], eltwiseOpTypes[eltwiseOpTypes.size() - 1])); + + ngraph::ResultVector results{std::make_shared(eltwiseOps[eltwiseOps.size() - 1])}; + function = std::make_shared(results, ngraphParam, "eltwise_chain_fq"); + } else { + std::vector> eltwiseOps; + eltwiseOps.push_back(ngraph::builder::makeEltwise(ngraphParam[0], ngraphInputs[0], eltwiseOpTypes[0])); + for (int i = 1; i < eltwiseOpTypes.size(); i++) { + eltwiseOps.push_back(ngraph::builder::makeEltwise(eltwiseOps[eltwiseOps.size() - 1], ngraphInputs[i], eltwiseOpTypes[i])); + } + + ngraph::ResultVector results{std::make_shared(eltwiseOps[eltwiseOps.size() - 1])}; + function = std::make_shared(results, ngraphParam, "eltwise_chain"); + } + } +}; + +TEST_P(EltwiseChainTest, CompareWithRefs) { + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + Run(); +} + +namespace { + +std::vector>> inputShapes { + { + {{1, 1, 2, 3}, {1, 1, 2, 3}, {1, 1, 2, 3}, {1, 1, 2, 3}}, + {{1, 48, 5, 6}, {1, 48, 1, 1}, {1, 48, 5, 6}, {1, 1, 5, 6}}, + {{1, 72, 28, 28}, {1, 72, 1, 1}, {1, 72, 1, 1}, {1, 72, 1, 1}}, + {{2, 33, 5, 5}, {2, 33, 5, 5}, {2, 33, 1, 5}, {2, 33, 5, 5}}, + {{1, 2, 3}, {3}, {3}, {3}}, + {{1, 12, 5, 5}, {5, 5}, {12, 5, 5}, {1}}, + {{3, 12, 5, 5}, {1, 12, 5, 1}, {3, 1, 1, 1}, {3, 12, 5, 5}}, + {{1, 1, 1, 1}, {1, 12, 5, 1}, {3, 12, 1, 5}, {3, 12, 5, 1}}, + {{1, 1, 1, 1, 6}, {1, 12, 5, 1, 6}, {3, 12, 1, 5, 1}, {3, 12, 5, 1, 1}} + } +}; + +std::vector> inputPrecisions = { + { Precision::FP32, Precision::FP32, Precision::FP32, Precision::FP32 }, + { Precision::I32, Precision::I32, Precision::I32, Precision::I32 } +}; + +std::vector> eltwiseOps = { + { EltwiseTypes::ADD, EltwiseTypes::MULTIPLY, EltwiseTypes::SUBTRACT }, + { EltwiseTypes::DIVIDE, EltwiseTypes::POWER, EltwiseTypes::ADD }, +}; + +INSTANTIATE_TEST_CASE_P(smoke_EltwiseChain, EltwiseChainTest, + ::testing::Combine( + ::testing::ValuesIn(inputShapes), + ::testing::ValuesIn(inputPrecisions), + ::testing::ValuesIn(eltwiseOps), + ::testing::Values(false), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + EltwiseChainTest::getTestCaseName); + +std::vector>> inputShapesFQ { + { + {{1, 2, 2, 3}, {1, 2, 2, 3}, {1, 2, 2, 3}, {1, 2, 2, 3}}, + {{2, 33, 5, 5}, {2, 33, 5, 5}, {2, 33, 1, 5}, {2, 33, 5, 5}}, + {{2, 33, 5, 17}, {2, 33, 5, 17}, {2, 33, 5, 17}, {2, 33, 5, 17}}, + {{2, 33, 5, 256}, {2, 33, 5, 256}, {2, 33, 5, 256}, {2, 33, 5, 256}}, + {{2, 5, 7, 
5}, {2, 5, 1, 5}, {2, 5, 7, 5}, {2, 5, 7, 5}}, + {{2, 17, 7, 5}, {2, 17, 7, 5}, {2, 17, 7, 5}, {2, 17, 7, 5}}, + {{2, 256, 7, 5}, {2, 256, 7, 5}, {2, 256, 1, 5}, {2, 256, 7, 5}}, + {{1, 36, 34, 34}, {1, 36, 34, 34}, {1, 36, 34, 34}, {1, 36, 34, 34}}, + {{1, 12, 1, 1, 6}, {1, 12, 5, 1, 6}, {3, 12, 1, 5, 1}, {3, 12, 5, 1, 1}}, + {{1, 12, 1, 1, 6}, {1, 12, 5, 5, 6}, {3, 12, 1, 5, 1}, {3, 12, 5, 5, 1}}, + {{1, 12, 1, 1, 1}, {1, 12, 5, 1, 7}, {3, 12, 1, 5, 7}, {3, 12, 5, 1, 7}}, + {{1, 7, 1, 1, 12}, {1, 7, 5, 1, 12}, {3, 7, 1, 5, 12}, {3, 7, 5, 1, 12}}, + {{1, 7, 1, 1, 12, 3, 7}, {1, 7, 5, 1, 12, 3, 7}, {3, 7, 1, 5, 12, 3, 7}, {3, 7, 5, 1, 12, 3, 7}}, + {{1, 7, 1, 1, 12, 3, 1}, {1, 7, 5, 1, 12, 3, 7}, {3, 7, 1, 5, 12, 1, 7}, {3, 7, 5, 1, 12, 3, 1}} + } +}; + +std::vector> inputPrecisionsFQ { + { Precision::FP32, Precision::FP32, Precision::FP32, Precision::FP32 } +}; + +INSTANTIATE_TEST_CASE_P(smoke_EltwiseChainWithFQ, EltwiseChainTest, + ::testing::Combine( + ::testing::ValuesIn(inputShapesFQ), + ::testing::ValuesIn(inputPrecisionsFQ), + ::testing::ValuesIn(eltwiseOps), + ::testing::Values(true), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + EltwiseChainTest::getTestCaseName); + +} // namespace +} // namespace CPULayerTestsDefinitions diff --git a/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/eltwise.cpp b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/eltwise.cpp index dec5e1e..22bf1a5 100644 --- a/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/eltwise.cpp +++ b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/eltwise.cpp @@ -93,10 +93,25 @@ void EltwiseLayerTest::SetUp() { FAIL() << "Unsupported Secondary operation type"; } - auto secondaryInput = ngraph::builder::makeInputLayer(ngPrc, secondaryInputType, shape_input_secondary); - if (secondaryInputType == ngraph::helpers::InputLayerType::PARAMETER) { - input.push_back(std::dynamic_pointer_cast(secondaryInput)); + std::shared_ptr secondaryInput; + if (eltwiseType == ngraph::helpers::EltwiseTypes::DIVIDE || + eltwiseType == ngraph::helpers::EltwiseTypes::FLOOR_MOD || + eltwiseType == ngraph::helpers::EltwiseTypes::MOD) { + std::vector data(ngraph::shape_size(shape_input_secondary)); + data = NGraphFunctions::Utils::generateVector(ngraph::shape_size(shape_input_secondary)); + for (float &i : data) { + if (i == 0) { + i = 1; + } + } + secondaryInput = ngraph::builder::makeConstant(ngPrc, shape_input_secondary, data); + } else { + secondaryInput = ngraph::builder::makeInputLayer(ngPrc, secondaryInputType, shape_input_secondary); + if (secondaryInputType == ngraph::helpers::InputLayerType::PARAMETER) { + input.push_back(std::dynamic_pointer_cast(secondaryInput)); + } } + auto eltwise = ngraph::builder::makeEltwise(input[0], secondaryInput, eltwiseType); function = std::make_shared(eltwise, input, "Eltwise"); } diff --git a/inference-engine/tests/ngraph_functions/src/utils/ngraph_helpers.cpp b/inference-engine/tests/ngraph_functions/src/utils/ngraph_helpers.cpp index deddabb..4d05cef 100644 --- a/inference-engine/tests/ngraph_functions/src/utils/ngraph_helpers.cpp +++ b/inference-engine/tests/ngraph_functions/src/utils/ngraph_helpers.cpp @@ -564,6 +564,9 @@ std::ostream& operator<<(std::ostream & os, ngraph::helpers::EltwiseTypes type) case ngraph::helpers::EltwiseTypes::FLOOR_MOD: os << "FloorMod"; break; + case ngraph::helpers::EltwiseTypes::MOD: + os << "Mod"; + break; default: throw std::runtime_error("NOT_SUPPORTED_OP_TYPE"); } diff --git 
a/inference-engine/tests_deprecated/unit/engines/mkldnn/graph/layers/internal/graph_eltwise_test.cpp b/inference-engine/tests_deprecated/unit/engines/mkldnn/graph/layers/internal/graph_eltwise_test.cpp
index e17f2ab..d96d449 100644
--- a/inference-engine/tests_deprecated/unit/engines/mkldnn/graph/layers/internal/graph_eltwise_test.cpp
+++ b/inference-engine/tests_deprecated/unit/engines/mkldnn/graph/layers/internal/graph_eltwise_test.cpp
@@ -261,655 +261,6 @@ std::string select_op(eltwise_test_params::opType op) {
     return str_op;
 }
 
-class MKLDNNGraphEltwise3InputsTests: public TestsCommon,
-                                      public WithParamInterface<eltwise_test_params> {
-    std::string model_t = R"V0G0N(
-
-
-
-
-            __SRC_DIMS_1__
-
-
-
-
-
-            __SRC_DIMS_2__
-
-
-
-
-
-            __SRC_DIMS_3__
-
-
-
-
-
-
-            __SRC_DIMS_1__
-
-            __SRC_DIMS_2__
-
-            __SRC_DIMS_3__
-
-
-
-            __SRC_DIMS__
-
-
-
-
-
-
-
-
-
-
-)V0G0N";
-
-protected:
-    std::string getModel(eltwise_test_params p) {
-        std::string model = model_t;
-        std::string op = select_op(p.op);
-
-        std::string src_dims1;
-        for (auto &dim : p.dims1) {
-            src_dims1 += "\n                <dim>";
-            src_dims1 += std::to_string(dim) + "</dim>";
-        }
-        REPLACE_WITH_STR(model, "__SRC_DIMS_1__", src_dims1);
-
-        std::string src_dims2;
-        for (auto &dim : p.dims2) {
-            src_dims2 += "\n                <dim>";
-            src_dims2 += std::to_string(dim) + "</dim>";
-        }
-        REPLACE_WITH_STR(model, "__SRC_DIMS_2__", src_dims2);
-
-        std::string src_dims3;
-        for (auto &dim : p.dims3) {
-            src_dims3 += "\n                <dim>";
-            src_dims3 += std::to_string(dim) + "</dim>";
-        }
-        REPLACE_WITH_STR(model, "__SRC_DIMS_3__", src_dims3);
-
-        std::string src_dims;
-        std::vector<size_t> dims = p.dims1;
-        for (int i = 0; i < dims.size(); i++) {
-            dims[i] = std::max(p.dims1[i], p.dims2[i]);
-            dims[i] = std::max(dims[i], p.dims3[i]);
-        }
-        for (auto &dim : dims) {
-            src_dims += "\n                <dim>";
-            src_dims += std::to_string(dim) + "</dim>";
-        }
-        REPLACE_WITH_STR(model, "__SRC_DIMS__", src_dims);
-
-        std::string scale;
-        if (!p.scales.empty()) {
-            scale = std::string("coeff=\"") + to_string_c_locale(p.scales) + std::string("\"");
-        }
-        REPLACE_WITH_STR(model, "_OP_", op);
-        REPLACE_WITH_STR(model, "_COEFF_", scale);
-
-        return model;
-    }
-
-    virtual void TearDown() {
-    }
-
-    virtual void SetUp() {
-        try {
-            TestsCommon::SetUp();
-            eltwise_test_params p = ::testing::WithParamInterface<eltwise_test_params>::GetParam();
-            std::string model = getModel(p);
-
-            InferenceEngine::Core core;
-            InferenceEngine::CNNNetwork network;
-            ASSERT_NO_THROW(network = core.ReadNetwork(model, InferenceEngine::Blob::CPtr()));
-
-            MKLDNNGraphTestClass graph;
-            graph.CreateGraph(network);
-
-            auto& nodes = graph.getNodes();
-            for (int i = 0; i < nodes.size(); i++) {
-                if (nodes[i]->getType() == MKLDNNPlugin::Eltwise) {
-                    ASSERT_EQ(p.num_prim_desc, nodes[i]->getSupportedPrimitiveDescriptors().size());
-                    for (size_t j = 0; j < p.num_prim_desc && j < p.comp.size(); j++) {
-                        p.comp.at(j)(nodes[i]->getSupportedPrimitiveDescriptors().at(j));
-                    }
-                    ASSERT_NE(nullptr, nodes[i]->getSelectedPrimitiveDescriptor());
-                    ASSERT_EQ(p.selectedType, nodes[i]->getSelectedPrimitiveDescriptor()->getImplementationType());
-                }
-            }
-            InferenceEngine::SizeVector dims_src1 = p.dims1;
-            InferenceEngine::Layout layout1 = InferenceEngine::ANY;
-            switch (p.dims1.size()) {
-                case 4:
-                    layout1 = InferenceEngine::NCHW;
-                    break;
-                case 5:
-                    layout1 = InferenceEngine::NCDHW;
-                    break;
-            }
-            InferenceEngine::SizeVector dims_src2 = p.dims2;
-            InferenceEngine::Layout layout2 = InferenceEngine::ANY;
-            switch (p.dims2.size()) {
-                case 4:
-                    layout2 = InferenceEngine::NCHW;
-                    break;
-                case 5:
-                    layout2 = InferenceEngine::NCDHW;
-                    break;
-            }
-
-            InferenceEngine::SizeVector dims_src3 = p.dims3;
-            InferenceEngine::Layout layout3 = InferenceEngine::ANY;
-            switch (p.dims3.size()) {
-                case 4:
-                    layout3 = InferenceEngine::NCHW;
-                    break;
-                case 5:
-                    layout3 = InferenceEngine::NCDHW;
-                    break;
-            }
-
-            InferenceEngine::Blob::Ptr src1 = InferenceEngine::make_shared_blob<float>({InferenceEngine::Precision::FP32, dims_src1, layout1});
-            src1->allocate();
-
-            InferenceEngine::TBlob<float>* srcPtr1 = dynamic_cast<InferenceEngine::TBlob<float>*>(src1.get());
-
-            if (srcPtr1 == nullptr)
-                FAIL() << "Cannot cast blob to TBlob<float>.";
-            CommonTestUtils::fill_data_sine(src1->buffer(), src1->size(), 0.1, 0.9, 1);
-            InferenceEngine::Blob::Ptr src2 = InferenceEngine::make_shared_blob<float>({InferenceEngine::Precision::FP32, dims_src2, layout2});
-            src2->allocate();
-
-            InferenceEngine::TBlob<float>* srcPtr2 = dynamic_cast<InferenceEngine::TBlob<float>*>(src2.get());
-
-            if (srcPtr2 == nullptr)
-                FAIL() << "Cannot cast blob to TBlob<float>.";
-            CommonTestUtils::fill_data_sine(src2->buffer(), src2->size(), 0.1, 0.9, 2);
-            InferenceEngine::Blob::Ptr src3 = InferenceEngine::make_shared_blob<float>({InferenceEngine::Precision::FP32, dims_src3, layout3});
-            src3->allocate();
-
-            InferenceEngine::TBlob<float>* srcPtr3 = dynamic_cast<InferenceEngine::TBlob<float>*>(src3.get());
-
-            if (srcPtr3 == nullptr)
-                FAIL() << "Cannot cast blob to TBlob<float>.";
-            CommonTestUtils::fill_data_sine(src3->buffer(), src3->size(), 0.1, 0.9, 3);
-            InferenceEngine::BlobMap srcs;
-            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in1", src1));
-            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in2", src2));
-            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in3", src3));
-
-            InferenceEngine::OutputsDataMap out;
-            out = network.getOutputsInfo();
-            InferenceEngine::BlobMap outputBlobs;
-
-            std::pair<std::string, InferenceEngine::DataPtr> item = *out.begin();
-
-            InferenceEngine::TBlob<float>::Ptr output;
-            output = InferenceEngine::make_shared_blob<float>(item.second->getTensorDesc());
-            output->allocate();
-            outputBlobs[item.first] = output;
-
-            graph.Infer(srcs, outputBlobs);
-
-            InferenceEngine::TBlob<float> dst_ref(item.second->getTensorDesc());
-            dst_ref.allocate();
-
-            std::vector<InferenceEngine::TBlob<float>> src_vec = {*srcPtr1, *srcPtr2, *srcPtr3};
-
-            ref_eltwise(src_vec, dst_ref, p);
-
-            compare(*output, dst_ref, 0.0005f);
-        } catch (const InferenceEngine::details::InferenceEngineException &e) {
-            FAIL() << e.what();
-        }
-    }
-};
-
-TEST_P(MKLDNNGraphEltwise3InputsTests, TestsEltwise) {}
-
-
-INSTANTIATE_TEST_CASE_P(
-        TestsEltwise, MKLDNNGraphEltwise3InputsTests,
-        ::testing::Values(
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref, {
-                        [](MKLDNNPlugin::PrimitiveDescInfo impl) {
-                            ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType());
-                            ASSERT_EQ(3, impl.getConfig().inConfs.size());
-                            ASSERT_EQ(1, impl.getConfig().outConfs.size());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(1).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(2).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
-                        }
-                } },
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sum, "1.0,1.0,1.0", 3, MKLDNNPlugin::impl_desc_type::ref, {
-                        [](MKLDNNPlugin::PrimitiveDescInfo impl) {
-                            ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType());
-                            ASSERT_EQ(3, impl.getConfig().inConfs.size());
-                            ASSERT_EQ(1, impl.getConfig().outConfs.size());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(1).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(2).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
-                        }
-                } },
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sum, "1.5,0.5,-2.0", 3, MKLDNNPlugin::impl_desc_type::ref, {
-                        [](MKLDNNPlugin::PrimitiveDescInfo impl) {
-                            ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType());
-                            ASSERT_EQ(3, impl.getConfig().inConfs.size());
-                            ASSERT_EQ(1, impl.getConfig().outConfs.size());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(1).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(2).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
-                        }
-                } },
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Prod, "", 3, MKLDNNPlugin::impl_desc_type::ref, {
-                        [](MKLDNNPlugin::PrimitiveDescInfo impl) {
-                            ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType());
-                            ASSERT_EQ(3, impl.getConfig().inConfs.size());
-                            ASSERT_EQ(1, impl.getConfig().outConfs.size());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(1).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(2).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
-                        }
-                } },
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Max, "", 3, MKLDNNPlugin::impl_desc_type::ref, {
-                        [](MKLDNNPlugin::PrimitiveDescInfo impl) {
-                            ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType());
-                            ASSERT_EQ(3, impl.getConfig().inConfs.size());
-                            ASSERT_EQ(1, impl.getConfig().outConfs.size());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(1).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(2).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
-                        }
-                } },
-                eltwise_test_params{{1, 32, 16, 16, 16},{1, 32, 16, 16, 16},{1, 32, 16, 16, 16}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref, {
-                        [](MKLDNNPlugin::PrimitiveDescInfo impl) {
-                            ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType());
-                            ASSERT_EQ(3, impl.getConfig().inConfs.size());
-                            ASSERT_EQ(1, impl.getConfig().outConfs.size());
-                            ASSERT_EQ(InferenceEngine::Layout::NCDHW, impl.getConfig().inConfs.at(0).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCDHW, impl.getConfig().inConfs.at(1).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCDHW, impl.getConfig().inConfs.at(2).desc.getLayout());
-                            ASSERT_EQ(InferenceEngine::Layout::NCDHW, impl.getConfig().outConfs.at(0).desc.getLayout());
-                        }
-                } },
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Min, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sub, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Div, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Logical_AND, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Logical_OR, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Logical_XOR, "", 3, MKLDNNPlugin::impl_desc_type::ref}
-        ));
-
-class MKLDNNGraphEltwise2InputsTests: public TestsCommon,
-                                      public WithParamInterface<eltwise_test_params> {
-    std::string model_t = R"V0G0N(
-
-
-
-
-            __SRC_DIMS_1__
-
-
-
-
-
-            __SRC_DIMS_2__
-
-
-
-
-
-
-            __SRC_DIMS_1__
-
-            __SRC_DIMS_2__
-
-
-
-            __SRC_DIMS__
-
-
-
-
-
-
-
-
-
-)V0G0N";
-
-protected:
-    std::string getModel(eltwise_test_params p) {
-        std::string model = model_t;
-        std::string op = select_op(p.op);
-
-        std::string src_dims1 = "";
-        for (auto &dim : p.dims1) {
-            src_dims1 += "\n                <dim>";
-            src_dims1 += std::to_string(dim) + "</dim>";
-        }
-        REPLACE_WITH_STR(model, "__SRC_DIMS_1__", src_dims1);
-
-        std::string src_dims2 = "";
-        for (auto &dim : p.dims2) {
-            src_dims2 += "\n                <dim>";
-            src_dims2 += std::to_string(dim) + "</dim>";
-        }
-        REPLACE_WITH_STR(model, "__SRC_DIMS_2__", src_dims2);
-
-        std::string src_dims;
-        std::vector<size_t> dims = (p.dims1.size() >= p.dims2.size()) ? p.dims1 : p.dims2;
-        int i = dims.size() - 1, j = p.dims1.size() - 1, k = p.dims2.size() - 1;
-        for (; j >= 0 && k >= 0; i--, j--, k-- ) {
-            dims[i] = std::max(p.dims1[j], p.dims2[k]);
-        }
-
-        for (auto &dim : dims) {
-            src_dims += "\n                <dim>";
-            src_dims += std::to_string(dim) + "</dim>";
-        }
-        REPLACE_WITH_STR(model, "__SRC_DIMS__", src_dims);
-
-        std::string scale;
-        if (!p.scales.empty()) {
-            scale = std::string("coeff=\"") + to_string_c_locale(p.scales) + std::string("\"");
-        }
-        REPLACE_WITH_STR(model, "_OP_", op);
-        REPLACE_WITH_STR(model, "_COEFF_", scale);
-
-        return model;
-    }
-
-    virtual void TearDown() {
-    }
-
-    virtual void SetUp() {
-        try {
-            TestsCommon::SetUp();
-            eltwise_test_params p = ::testing::WithParamInterface<eltwise_test_params>::GetParam();
-            std::string model = getModel(p);
-
-            InferenceEngine::Core core;
-            InferenceEngine::CNNNetwork network;
-            ASSERT_NO_THROW(network = core.ReadNetwork(model, InferenceEngine::Blob::CPtr()));
-
-            MKLDNNGraphTestClass graph;
-            graph.CreateGraph(network);
-
-            auto& nodes = graph.getNodes();
-            for (int i = 0; i < nodes.size(); i++) {
-                if (nodes[i]->getType() == MKLDNNPlugin::Eltwise) {
-                    ASSERT_EQ(p.num_prim_desc, nodes[i]->getSupportedPrimitiveDescriptors().size());
-                    for (size_t j = 0; j < p.num_prim_desc && j < p.comp.size(); j++) {
-                        p.comp.at(j)(nodes[i]->getSupportedPrimitiveDescriptors().at(j));
-                    }
-                    ASSERT_NE(nullptr, nodes[i]->getSelectedPrimitiveDescriptor());
-                    ASSERT_EQ(p.selectedType, nodes[i]->getSelectedPrimitiveDescriptor()->getImplementationType());
-                }
-            }
-            InferenceEngine::SizeVector dims_src1 = p.dims1;
-            InferenceEngine::Blob::Ptr src1 = InferenceEngine::make_shared_blob<float>({InferenceEngine::Precision::FP32, dims_src1, InferenceEngine::TensorDesc::getLayoutByDims(p.dims1) });
-            src1->allocate();
-
-            InferenceEngine::TBlob<float>* srcPtr1 = dynamic_cast<InferenceEngine::TBlob<float>*>(src1.get());
-
-            if (srcPtr1 == nullptr)
-                FAIL() << "Cannot cast blob to TBlob<float>.";
-
-            CommonTestUtils::fill_data_sine(src1->buffer(), src1->size(), 0.1, 0.9, 1);
-
-            InferenceEngine::SizeVector dims_src2 = p.dims2;
-            InferenceEngine::Blob::Ptr src2 = InferenceEngine::make_shared_blob<float>({InferenceEngine::Precision::FP32, dims_src2, InferenceEngine::TensorDesc::getLayoutByDims(p.dims2) });
-            src2->allocate();
-
-            InferenceEngine::TBlob<float>* srcPtr2 = dynamic_cast<InferenceEngine::TBlob<float>*>(src2.get());
-
-            if (srcPtr2 == nullptr)
-                FAIL() << "Cannot cast blob to TBlob<float>.";
-
-            CommonTestUtils::fill_data_sine(src2->buffer(), src2->size(), 0.1, 0.9, 2);
-
-            InferenceEngine::BlobMap srcs;
-            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in1", src1));
-            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in2", src2));
-
-            InferenceEngine::OutputsDataMap out;
-            out = network.getOutputsInfo();
-            InferenceEngine::BlobMap outputBlobs;
-
-            std::pair<std::string, InferenceEngine::DataPtr> item = *out.begin();
-
-            InferenceEngine::TBlob<float>::Ptr output;
-            output = InferenceEngine::make_shared_blob<float>(item.second->getTensorDesc());
-            output->allocate();
-            outputBlobs[item.first] = output;
-
-            graph.Infer(srcs, outputBlobs);
-
-            InferenceEngine::TBlob<float> dst_ref(item.second->getTensorDesc());
-            dst_ref.allocate();
-
-            std::vector<InferenceEngine::TBlob<float>> src_vec = {*srcPtr1, *srcPtr2};
-
-            ref_eltwise(src_vec, dst_ref, p);
-
-            compare(*output, dst_ref, 0.0005f);
-        } catch (const InferenceEngine::details::InferenceEngineException &e) {
-            FAIL() << e.what();
-        }
-    }
-
-};
-
-TEST_P(MKLDNNGraphEltwise2InputsTests, TestsEltwise) {}
-
-INSTANTIATE_TEST_CASE_P(
-        TestsEltwise, MKLDNNGraphEltwise2InputsTests,
-        ::testing::Values(
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Prod, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Max, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Min, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Sub, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Div, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Squared_diff, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Logical_AND, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Logical_OR, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Logical_XOR, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Less, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Less_equal, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Greater, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Greater_equal, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Equal, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Not_equal, "", 3, MKLDNNPlugin::impl_desc_type::ref}
-        ));
-
-INSTANTIATE_TEST_CASE_P(
-        TestsBroadcasting, MKLDNNGraphEltwise2InputsTests,
-        ::testing::Values(
-                eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Prod, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Max, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Min, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Sub, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Div, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Squared_diff, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Logical_AND, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Logical_OR, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Logical_XOR, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                // batch broadcasting
-                eltwise_test_params{{1, 3, 224},{224, 3, 1},{}, eltwise_test_params::opType::Sum, "", 2, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{2, 3, 1, 2},{1, 3, 2, 1},{}, eltwise_test_params::opType::Sub, "", 1, MKLDNNPlugin::impl_desc_type::ref}
-
-        ));
-
-INSTANTIATE_TEST_CASE_P(
-        TestsDiffDims, MKLDNNGraphEltwise2InputsTests,
-        ::testing::Values(
-                eltwise_test_params{{},{1, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3},{},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3},{3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{},{1, 3, 3},{}, eltwise_test_params::opType::Sum, "", 2, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3},{},{}, eltwise_test_params::opType::Sum, "", 2, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3},{3},{}, eltwise_test_params::opType::Sum, "", 2, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3},{1, 3, 3},{}, eltwise_test_params::opType::Sum, "", 2, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3},{1, 3},{}, eltwise_test_params::opType::Sum, "", 2, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{},{1, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{},{1, 3, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3, 3},{},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3},{1, 3, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3, 3},{1, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3},{1, 3, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3, 3},{1, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref}
-        ));
-
-class MKLDNNGraphEltwiseDynBatchTests: public MKLDNNGraphEltwise3InputsTests {
-protected:
-    virtual void SetUp() {
-        try {
-            TestsCommon::SetUp();
-            eltwise_test_params p = ::testing::WithParamInterface<eltwise_test_params>::GetParam();
-            std::string model = getModel(p);
-            size_t MB = p.dims1[0];
-            if (MB < 2)
-                MB = 2;
-
-            InferenceEngine::Core core;
-            InferenceEngine::CNNNetwork network;
-            ASSERT_NO_THROW(network = core.ReadNetwork(model, InferenceEngine::Blob::CPtr()));
-
-            auto implNet = dynamic_cast<InferenceEngine::details::CNNNetworkImpl *>(&((InferenceEngine::ICNNNetwork&)network));
-            ASSERT_NE(nullptr, implNet) << "Failed to cast ICNNNetwork to CNNNetworkImpl";
-            InferenceEngine::ResponseDesc resp;
-            InferenceEngine::StatusCode sts = implNet->setBatchSizeReshape(MB, &resp);
-            ASSERT_EQ((int)InferenceEngine::StatusCode::OK, sts) << resp.msg;
-
-            MKLDNNGraphTestClass graph;
-            graph.setProperty({{InferenceEngine::PluginConfigParams::KEY_DYN_BATCH_ENABLED, InferenceEngine::PluginConfigParams::YES}});
-            graph.CreateGraph(network);
-
-            InferenceEngine::SizeVector dims_src1 = p.dims1;
-            InferenceEngine::Layout layout1 = InferenceEngine::ANY;
-            switch (p.dims1.size()) {
-                case 4:
-                    layout1 = InferenceEngine::NCHW;
-                    break;
-                case 5:
-                    layout1 = InferenceEngine::NCDHW;
-                    break;
-            }
-            InferenceEngine::SizeVector dims_src2 = p.dims2;
-            InferenceEngine::Layout layout2 = InferenceEngine::ANY;
-            switch (p.dims2.size()) {
-                case 4:
-                    layout2 = InferenceEngine::NCHW;
-                    break;
-                case 5:
-                    layout2 = InferenceEngine::NCDHW;
-                    break;
-            }
-            InferenceEngine::SizeVector dims_src3 = p.dims3;
-            InferenceEngine::Layout layout3 = InferenceEngine::ANY;
-            switch (p.dims3.size()) {
-                case 4:
-                    layout3 = InferenceEngine::NCHW;
-                    break;
-                case 5:
-                    layout3 = InferenceEngine::NCDHW;
-                    break;
-            }
-
-            InferenceEngine::Blob::Ptr src1 = InferenceEngine::make_shared_blob<float>({InferenceEngine::Precision::FP32, dims_src1, layout1});
-            src1->allocate();
-
-            InferenceEngine::TBlob<float>* srcPtr1 = dynamic_cast<InferenceEngine::TBlob<float>*>(src1.get());
-
-            if (srcPtr1 == nullptr)
-                FAIL() << "Cannot cast blob to TBlob<float>.";
-
-            fill_data(src1->buffer(), src1->size());
-            InferenceEngine::Blob::Ptr src2 = InferenceEngine::make_shared_blob<float>({InferenceEngine::Precision::FP32, dims_src2, layout2});
-            src2->allocate();
-
-            InferenceEngine::TBlob<float>* srcPtr2 = dynamic_cast<InferenceEngine::TBlob<float>*>(src2.get());
-
-            if (srcPtr2 == nullptr)
-                FAIL() << "Cannot cast blob to TBlob<float>.";
-            fill_data(src2->buffer(), src2->size());
-            InferenceEngine::Blob::Ptr src3 = InferenceEngine::make_shared_blob<float>({InferenceEngine::Precision::FP32, dims_src3, layout3});
-            src3->allocate();
-
-            InferenceEngine::TBlob<float>* srcPtr3 = dynamic_cast<InferenceEngine::TBlob<float>*>(src3.get());
-
-            if (srcPtr3 == nullptr)
-                FAIL() << "Cannot cast blob to TBlob<float>.";
-            fill_data(src3->buffer(), src3->size());
-            InferenceEngine::BlobMap srcs;
-            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in1", src1));
-            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in2", src2));
-            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in3", src3));
-
-            InferenceEngine::OutputsDataMap out;
-            out = network.getOutputsInfo();
-            InferenceEngine::BlobMap outputBlobs;
-
-            std::pair<std::string, InferenceEngine::DataPtr> item = *out.begin();
-
-            InferenceEngine::TBlob<float>::Ptr output;
-            output = InferenceEngine::make_shared_blob<float>(item.second->getTensorDesc());
-            output->allocate();
-            outputBlobs[item.first] = output;
-
-
-            auto checkDepthwise = [](const MKLDNNPlugin::MKLDNNNodePtr& node) {
-                return node->getType() == MKLDNNPlugin::Eltwise;
-            };
-
-            graph.checkDynBatch(srcs, outputBlobs, MB, MB, checkDepthwise);
-            graph.checkDynBatch(srcs, outputBlobs, 1, MB, checkDepthwise);
-        } catch (const InferenceEngine::details::InferenceEngineException &e) {
-            FAIL() << e.what();
-        }
-    }
-};
-
-TEST_P(MKLDNNGraphEltwiseDynBatchTests, TestsDynBatchEltwise) {}
-
-// TODO: rewrite to ngraph to have reshape functionality
-INSTANTIATE_TEST_CASE_P(
-        DISABLED_TestsDynBatchEltwise, MKLDNNGraphEltwiseDynBatchTests,
-        ::testing::Values(
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sum, "1.0,1.0,1.0", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sum, "1.5,0.5,-2.0", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Prod, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Max, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sub, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Min, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Div, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Pow, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Logical_AND, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Logical_OR, "", 3, MKLDNNPlugin::impl_desc_type::ref},
-                eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Logical_XOR, "", 3, MKLDNNPlugin::impl_desc_type::ref}
-        ));
-
 struct precisions_test_2params {
     struct {
         std::string precision0;
@@ -1022,7 +373,7 @@ INSTANTIATE_TEST_CASE_P(
         TestsEltwise2Precisions, MKLDNNGraphEltwise2PrecisionsTests,
         ::testing::Values(
                 precisions_test_2params{ {"FP32", "FP32"}, 4, 0 },
-                precisions_test_2params{ { "U8", "FP32"}, 5, 1 },
-                precisions_test_2params{ {"FP32", "U8"}, 5, 1 },
-                precisions_test_2params{ { "U8", "U8"}, 6, 2 }
+                precisions_test_2params{ { "U8", "FP32"}, 4, 0 },
+                precisions_test_2params{ {"FP32", "U8"}, 4, 0 },
+                precisions_test_2params{ { "U8", "U8"}, 4, 0 }
        ));
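A note on the removed suites above: each case built a small IR, ran it through MKLDNNGraphTestClass, and compared the Eltwise node's output against a scalar reference (ref_eltwise) with a 0.0005f tolerance. For orientation only, here is a minimal standalone sketch of such a reference for the Sum case; it is not the suite's actual ref_eltwise() helper, it assumes equally shaped FP32 inputs, and it omits broadcasting and the non-Sum opType values:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Coefficient-weighted sum over N equally shaped inputs. An empty
    // coefficient list stands in for the empty coeff="" attribute in the
    // removed test cases, i.e. all coefficients default to 1.0.
    static void ref_eltwise_sum(const std::vector<std::vector<float>>& srcs,
                                const std::vector<float>& coeffs,
                                std::vector<float>& dst) {
        assert(!srcs.empty());
        dst.assign(srcs[0].size(), 0.0f);
        for (size_t i = 0; i < srcs.size(); ++i) {
            assert(srcs[i].size() == dst.size());  // broadcasting omitted here
            const float c = coeffs.empty() ? 1.0f : coeffs[i];
            for (size_t j = 0; j < dst.size(); ++j)
                dst[j] += c * srcs[i][j];
        }
    }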
diff --git a/inference-engine/tests_deprecated/unit/engines/mkldnn/graph/layers/internal/graph_power_test.cpp b/inference-engine/tests_deprecated/unit/engines/mkldnn/graph/layers/internal/graph_power_test.cpp
index cf0650b..84b5a08 100644
--- a/inference-engine/tests_deprecated/unit/engines/mkldnn/graph/layers/internal/graph_power_test.cpp
+++ b/inference-engine/tests_deprecated/unit/engines/mkldnn/graph/layers/internal/graph_power_test.cpp
@@ -116,13 +116,12 @@ protected:
             graph.CreateGraph(network);
 
             auto& nodes = graph.getNodes();
             for (int i = 0; i < nodes.size(); i++) {
-                if (nodes[i]->getType() == MKLDNNPlugin::Power) {
+                if (nodes[i]->getType() == MKLDNNPlugin::Eltwise) {
                     ASSERT_EQ(p.num_prim_desc, nodes[i]->getSupportedPrimitiveDescriptors().size());
                     for (size_t j = 0; j < p.num_prim_desc && j < p.comp.size(); j++) {
                         p.comp.at(j)(nodes[i]->getSupportedPrimitiveDescriptors().at(j));
                     }
                     ASSERT_NE(nullptr, nodes[i]->getSelectedPrimitiveDescriptor());
-                    ASSERT_EQ(p.selectedType, nodes[i]->getSelectedPrimitiveDescriptor()->getImplementationType());
                 }
             }
 
@@ -174,25 +173,16 @@ INSTANTIATE_TEST_CASE_P(
                 power_test_params{
                         {1, 3, 13, 13}, 1, 2, 0.5f, 3, MKLDNNPlugin::impl_desc_type::unknown, {
                                 [](MKLDNNPlugin::PrimitiveDescInfo impl) {
-                                    ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
                                     ASSERT_EQ(1, impl.getConfig().inConfs.size());
                                     ASSERT_EQ(1, impl.getConfig().outConfs.size());
-                                    ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
-                                    ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
                                 },
                                 [](MKLDNNPlugin::PrimitiveDescInfo impl) {
-                                    ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
                                     ASSERT_EQ(1, impl.getConfig().inConfs.size());
                                     ASSERT_EQ(1, impl.getConfig().outConfs.size());
-                                    ASSERT_EQ(InferenceEngine::Layout::BLOCKED, impl.getConfig().inConfs.at(0).desc.getLayout());
-                                    ASSERT_EQ(InferenceEngine::Layout::BLOCKED, impl.getConfig().outConfs.at(0).desc.getLayout());
                                 },
                                 [](MKLDNNPlugin::PrimitiveDescInfo impl) {
-                                    ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
                                     ASSERT_EQ(1, impl.getConfig().inConfs.size());
                                     ASSERT_EQ(1, impl.getConfig().outConfs.size());
-                                    ASSERT_EQ(InferenceEngine::Layout::BLOCKED, impl.getConfig().inConfs.at(0).desc.getLayout());
-                                    ASSERT_EQ(InferenceEngine::Layout::BLOCKED, impl.getConfig().outConfs.at(0).desc.getLayout());
                                 }}},
                 power_test_params{{1, 1, 23, 23}, 3, 8, 2, 3 },
                 power_test_params{{1, 8, 23, 23}, 8, 2, 1, 3 },
@@ -306,7 +296,7 @@ protected:
             outputBlobs[item.first] = output;
 
             auto checkPower = [](const MKLDNNPlugin::MKLDNNNodePtr& node) {
-                return node->getType() == MKLDNNPlugin::Power;
+                return node->getType() == MKLDNNPlugin::Eltwise;
             };
             graph.checkDynBatch(srcs, outputBlobs, MB, MB, checkPower);
             graph.checkDynBatch(srcs, outputBlobs, 1, MB, checkPower);
@@ -325,25 +315,16 @@ INSTANTIATE_TEST_CASE_P(
                 power_test_params{
                         {1, 3, 13, 13}, 1, 2, 0.5f, 3, MKLDNNPlugin::impl_desc_type::unknown, {
                                 [](MKLDNNPlugin::PrimitiveDescInfo impl) {
-                                    ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
                                     ASSERT_EQ(1, impl.getConfig().inConfs.size());
                                     ASSERT_EQ(1, impl.getConfig().outConfs.size());
-                                    ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
-                                    ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
                                 },
                                 [](MKLDNNPlugin::PrimitiveDescInfo impl) {
-                                    ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
                                     ASSERT_EQ(1, impl.getConfig().inConfs.size());
                                     ASSERT_EQ(1, impl.getConfig().outConfs.size());
-                                    ASSERT_EQ(InferenceEngine::Layout::BLOCKED, impl.getConfig().inConfs.at(0).desc.getLayout());
-                                    ASSERT_EQ(InferenceEngine::Layout::BLOCKED, impl.getConfig().outConfs.at(0).desc.getLayout());
                                 },
                                 [](MKLDNNPlugin::PrimitiveDescInfo impl) {
-                                    ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
                                     ASSERT_EQ(1, impl.getConfig().inConfs.size());
                                     ASSERT_EQ(1, impl.getConfig().outConfs.size());
-                                    ASSERT_EQ(InferenceEngine::Layout::BLOCKED, impl.getConfig().inConfs.at(0).desc.getLayout());
-                                    ASSERT_EQ(InferenceEngine::Layout::BLOCKED, impl.getConfig().outConfs.at(0).desc.getLayout());
                                 }}},
                 power_test_params{{1, 1, 23, 23}, 3, 8, 2, 3 },
                 power_test_params{{1, 8, 23, 23}, 8, 2, 1, 3 },
diff --git a/inference-engine/tests_deprecated/unit/engines/mkldnn/graph/structure/graph_conv_depthwise_fusing_test.cpp b/inference-engine/tests_deprecated/unit/engines/mkldnn/graph/structure/graph_conv_depthwise_fusing_test.cpp
index 3752e32..4f46fa5 100644
--- a/inference-engine/tests_deprecated/unit/engines/mkldnn/graph/structure/graph_conv_depthwise_fusing_test.cpp
+++ b/inference-engine/tests_deprecated/unit/engines/mkldnn/graph/structure/graph_conv_depthwise_fusing_test.cpp
@@ -257,14 +257,14 @@ protected:
             ASSERT_EQ(nodes.size(), 3);
             ASSERT_EQ(nodes[0].get()->getType(), MKLDNNPlugin::Type::Input);
             ASSERT_EQ(nodes[1].get()->getType(), MKLDNNPlugin::Type::Convolution);
-            ASSERT_TRUE(nodes[1].get()->isFusedWith(MKLDNNPlugin::Type::Depthwise));
+            ASSERT_TRUE(nodes[1].get()->isFusedWith(MKLDNNPlugin::Type::Eltwise));
             ASSERT_EQ(nodes[2].get()->getType(), MKLDNNPlugin::Type::Output);
         } else {
             ASSERT_EQ(nodes.size(), 5);
             ASSERT_EQ(nodes[0].get()->getType(), MKLDNNPlugin::Type::Input);
             ASSERT_EQ(nodes[1].get()->getType(), MKLDNNPlugin::Type::Reorder);
             ASSERT_EQ(nodes[2].get()->getType(), MKLDNNPlugin::Type::Convolution);
-            ASSERT_TRUE(nodes[2].get()->isFusedWith(MKLDNNPlugin::Type::Depthwise));
+            ASSERT_TRUE(nodes[2].get()->isFusedWith(MKLDNNPlugin::Type::Eltwise));
             ASSERT_EQ(nodes[3].get()->getType(), MKLDNNPlugin::Type::Reorder);
             ASSERT_EQ(nodes[4].get()->getType(), MKLDNNPlugin::Type::Output);
         }
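The fusing assertions above (and the similar ones in the structure tests below) no longer look for Depthwise or Activation entries: with the generic JIT implementation, ScaleShift/PReLU and activation post-ops folded into a convolution are recorded on the consumer as fused Eltwise operations. A standalone, illustrative-only model of the bookkeeping behind such an isFusedWith() query (the enum and struct here are made up, not the plugin's actual types):

    #include <algorithm>
    #include <vector>

    enum class Type { Input, Convolution, Eltwise, Reorder, Output };

    struct Node {
        Type type;
        std::vector<Type> fusedTypes;  // filled in by the fusing passes
        bool isFusedWith(Type t) const {
            return std::find(fusedTypes.begin(), fusedTypes.end(), t) != fusedTypes.end();
        }
    };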
diff --git a/inference-engine/tests_deprecated/unit/engines/mkldnn/graph/structure/graph_structure_test.cpp b/inference-engine/tests_deprecated/unit/engines/mkldnn/graph/structure/graph_structure_test.cpp
index 6e83b9b..301048d 100644
--- a/inference-engine/tests_deprecated/unit/engines/mkldnn/graph/structure/graph_structure_test.cpp
+++ b/inference-engine/tests_deprecated/unit/engines/mkldnn/graph/structure/graph_structure_test.cpp
@@ -186,10 +186,9 @@ TEST_F(MKLDNNGraphStructureTests, TestNoRedundantReorders) {
     for (auto &node : nodes) {
         if (node->getType() == MKLDNNPlugin::Reorder) {
             reorders_num++;
-            ASSERT_EQ(MKLDNNPlugin::Output, node->getChildEdgeAt(0)->getChild()->getType());
         }
     }
-    ASSERT_EQ(reorders_num, 1);
+    ASSERT_EQ(reorders_num, 3);
 }
 
 TEST_F(MKLDNNGraphStructureTests, TestRedundantReorderBeforeConvWithC_3) {
@@ -3781,7 +3780,7 @@ TEST_F(MKLDNNGraphStructureTests, TestNoRedundantReordersForXceptionTopology) {
     weights->allocate();
     fill_data((float *) weights->buffer(), weights->size() / sizeof(float));
     InferenceEngine::TBlob<uint8_t>::Ptr weights_ptr = InferenceEngine::TBlob<uint8_t>::Ptr(weights);
-    
+
     InferenceEngine::Core core;
     InferenceEngine::CNNNetwork network;
     ASSERT_NO_THROW(network = core.ReadNetwork(model, weights_ptr));
@@ -4020,7 +4019,7 @@ TEST_F(MKLDNNGraphStructureTests, TestFailedPartPlateRecognitionBarrier0001) {
     fill_data((float *) weights->buffer(), weights->size() / sizeof(float));
     InferenceEngine::TBlob<uint8_t>::Ptr weights_ptr = InferenceEngine::TBlob<uint8_t>::Ptr(weights);
-    
+
     InferenceEngine::Core core;
     InferenceEngine::CNNNetwork network;
     ASSERT_NO_THROW(network = core.ReadNetwork(model, weights_ptr));
@@ -4629,7 +4628,7 @@ TEST_F(MKLDNNGraphStructureTests, TestConvolutionDWConvolutionSumFusing) {
     memset((float *) weights->buffer(), 0, weights->size());
     InferenceEngine::TBlob<uint8_t>::Ptr weights_ptr = InferenceEngine::TBlob<uint8_t>::Ptr(weights);
-    
+
     InferenceEngine::Core core;
     InferenceEngine::CNNNetwork network;
     network = core.ReadNetwork(model, weights_ptr);
@@ -5127,7 +5126,7 @@ TEST_F(MKLDNNGraphStructureTests, TestGemmConvolutionWithConcat) {
     weights->allocate();
     fill_data((float *) weights->buffer(), weights->size() / sizeof(float));
     InferenceEngine::TBlob<uint8_t>::Ptr weights_ptr = InferenceEngine::TBlob<uint8_t>::Ptr(weights);
-    
+
     InferenceEngine::Core core;
     InferenceEngine::CNNNetwork network;
     ASSERT_NO_THROW(network = core.ReadNetwork(model, weights_ptr));
@@ -5412,7 +5411,7 @@ TEST_F(MKLDNNGraphStructureTests, TestRefPoolingWithConcat) {
     weights->allocate();
     fill_data((float *) weights->buffer(), weights->size() / sizeof(float));
     InferenceEngine::TBlob<uint8_t>::Ptr weights_ptr = InferenceEngine::TBlob<uint8_t>::Ptr(weights);
-    
+
     InferenceEngine::Core core;
     InferenceEngine::CNNNetwork network;
     ASSERT_NO_THROW(network = core.ReadNetwork(model, weights_ptr));
@@ -5566,7 +5565,7 @@ TEST_F(MKLDNNGraphStructureTests, TestConvolutionWith2DepthwiseOpFusing) {
     ASSERT_EQ(nodes[0].get()->getType(), MKLDNNPlugin::Type::Input);
     ASSERT_EQ(nodes[1].get()->getType(), MKLDNNPlugin::Type::Reorder);
     ASSERT_EQ(nodes[2].get()->getType(), MKLDNNPlugin::Type::Convolution);
-    ASSERT_TRUE(nodes[2].get()->isFusedWith(MKLDNNPlugin::Type::Depthwise));
+    ASSERT_TRUE(nodes[2].get()->isFusedWith(MKLDNNPlugin::Type::Eltwise));
     ASSERT_EQ(nodes[3].get()->getType(), MKLDNNPlugin::Type::Reorder);
     ASSERT_EQ(nodes[4].get()->getType(), MKLDNNPlugin::Type::Output);
 
@@ -5704,7 +5703,7 @@ TEST_F(MKLDNNGraphStructureTests, TestConvolutionWith2EltwiseOpFusing) {
     ASSERT_EQ(nodes.size(), 4);
     ASSERT_EQ(nodes[0].get()->getType(), MKLDNNPlugin::Type::Input);
     ASSERT_EQ(nodes[1].get()->getType(), MKLDNNPlugin::Type::Convolution);
-    ASSERT_TRUE(nodes[1].get()->isFusedWith(MKLDNNPlugin::Type::Activation));
+    ASSERT_TRUE(nodes[1].get()->isFusedWith(MKLDNNPlugin::Type::Eltwise));
     ASSERT_EQ(nodes[2].get()->getType(), MKLDNNPlugin::Type::Reorder);
     ASSERT_EQ(nodes[3].get()->getType(), MKLDNNPlugin::Type::Output);
 
@@ -5846,7 +5845,7 @@ TEST_F(MKLDNNGraphStructureTests, TestGemmConvolutionWith2DepthwiseOpFusing) {
     ASSERT_EQ(nodes.size(), 3);
     ASSERT_EQ(nodes[0].get()->getType(), MKLDNNPlugin::Type::Input);
     ASSERT_EQ(nodes[1].get()->getType(), MKLDNNPlugin::Type::Convolution);
-    ASSERT_TRUE(nodes[1].get()->isFusedWith(MKLDNNPlugin::Type::Depthwise));
+    ASSERT_TRUE(nodes[1].get()->isFusedWith(MKLDNNPlugin::Type::Eltwise));
     ASSERT_EQ(nodes[2].get()->getType(), MKLDNNPlugin::Type::Output);
 
     InferenceEngine::TensorDesc src_desc(InferenceEngine::Precision::FP32, {1, 8, 300, 600}, InferenceEngine::NCHW);
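The relaxed TestNoRedundantReorders expectation above (three reorders instead of one, and no requirement that a reorder feed an Output node) presumably reflects that the generic JIT eltwise is free to select blocked layouts, so reorders can legitimately appear inside the graph. A standalone sketch of the updated counting logic, reusing the illustrative Type enum from the previous sketch rather than the plugin's real node types:

    #include <cstddef>
    #include <vector>

    enum class Type { Input, Eltwise, Reorder, Output };

    // Count Reorder nodes only; where each reorder sits in the graph is no
    // longer asserted. For the topology in the test above the expected
    // count after this patch is 3.
    size_t countReorders(const std::vector<Type>& nodeTypes) {
        size_t reorders_num = 0;
        for (auto t : nodeTypes)
            if (t == Type::Reorder)
                ++reorders_num;
        return reorders_num;
    }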
diff --git a/inference-engine/tests_deprecated/unit/engines/mkldnn/graph/test_graph.hpp b/inference-engine/tests_deprecated/unit/engines/mkldnn/graph/test_graph.hpp
index 27ed5b3..7d381f4 100644
--- a/inference-engine/tests_deprecated/unit/engines/mkldnn/graph/test_graph.hpp
+++ b/inference-engine/tests_deprecated/unit/engines/mkldnn/graph/test_graph.hpp
@@ -27,6 +27,7 @@
 #include
 #include
 #include
+#include
 
 #define GARB_VAL(x) ((x + 100.0f + sin(x)) / (x + 150.f))
 
@@ -212,13 +213,66 @@ public:
         return graphNodes;
     }
 
+    void MoveInternalBlobsToConstLayers(InferenceEngine::details::CNNNetworkImpl* netImpl) {
+        auto createConstInputTo = [&](InferenceEngine::CNNLayerPtr layer, InferenceEngine::Blob::Ptr blob, std::string name) {
+            InferenceEngine::LayerParams attrs = {layer.get()->name + "_const_" + name, "Const", InferenceEngine::Precision::FP32};
+            auto constLayer = std::make_shared<InferenceEngine::CNNLayer>(attrs);
+            constLayer->blobs["custom"] = blob;
+
+            std::vector<size_t> constDims(layer->insData[0].lock()->getDims().size(), 1);
+            if (constDims.size() > 1)
+                constDims[1] = blob.get()->size();
+            else
+                constDims[0] = blob.get()->size();
+            const InferenceEngine::TensorDesc& td = {InferenceEngine::Precision::FP32, constDims, InferenceEngine::TensorDesc::getLayoutByDims(constDims)};
+
+            InferenceEngine::DataPtr newEdgeAfterLayer(new InferenceEngine::Data(constLayer->name, td));
+            newEdgeAfterLayer->setName(constLayer->name);
+            getCreatorLayer(newEdgeAfterLayer) = constLayer;
+            getInputTo(newEdgeAfterLayer).clear();
+
+
+            netImpl->addData(constLayer->name.c_str(), newEdgeAfterLayer);
+            IE_SUPPRESS_DEPRECATED_START
+            netImpl->addLayer(constLayer);
+            IE_SUPPRESS_DEPRECATED_END
+
+            constLayer->outData.push_back(newEdgeAfterLayer);
+            getInputTo(newEdgeAfterLayer)[layer->name] = layer;
+            layer->insData.push_back(newEdgeAfterLayer);
+        };
+
+        auto all_layers = InferenceEngine::details::CNNNetSortTopologically(*netImpl);
+        for (auto &layer : all_layers) {
+            if (layer->type == "ScaleShift" && layer->insData.size() == 1) {
+                InferenceEngine::Blob::Ptr scalesBlob = layer->blobs["weights"];
+                if (scalesBlob != nullptr)
+                    createConstInputTo(layer, scalesBlob, "weights");
+
+                InferenceEngine::Blob::Ptr shiftBlob = layer->blobs["biases"];
+                if (shiftBlob != nullptr)
+                    createConstInputTo(layer, shiftBlob, "biases");
+            } else if (layer->type == "PReLU" && layer->insData.size() == 1) {
+                InferenceEngine::Blob::Ptr scalesBlob = layer->blobs["weights"];
+                if (scalesBlob != nullptr)
+                    createConstInputTo(layer, scalesBlob, "weights");
+            }
+        }
+    }
+
     void CreateGraph(InferenceEngine::ICNNNetwork &network, const MKLDNNPlugin::MKLDNNExtensionManager::Ptr& extMgr,
             MKLDNNPlugin::MKLDNNWeightsSharing::Ptr cache = {}) {
         if (network.getFunction()) {
             auto convertedNetwork = std::make_shared<InferenceEngine::details::CNNNetworkImpl>(network);
+            MoveInternalBlobsToConstLayers(convertedNetwork.get());
             MKLDNNGraph::CreateGraph(static_cast<InferenceEngine::ICNNNetwork&>(*convertedNetwork),
-                    extMgr, cache);
+                                     extMgr, cache);
         } else {
+            InferenceEngine::details::CNNNetworkImpl* netImpl = dynamic_cast<InferenceEngine::details::CNNNetworkImpl*>(&network);
+            if (netImpl == nullptr) {
+                THROW_IE_EXCEPTION << "unexpected network type";
+            }
+            MoveInternalBlobsToConstLayers(netImpl);
             MKLDNNGraph::CreateGraph(network, extMgr, cache);
         }
     }
@@ -227,9 +281,15 @@ public:
         MKLDNNPlugin::MKLDNNWeightsSharing::Ptr cache;
         if (network.getFunction()) {
             auto convertedNetwork = std::make_shared<InferenceEngine::details::CNNNetworkImpl>(network);
+            MoveInternalBlobsToConstLayers(convertedNetwork.get());
             MKLDNNGraph::CreateGraph(static_cast<InferenceEngine::ICNNNetwork&>(*convertedNetwork), extensionManager, cache);
         } else {
+            InferenceEngine::details::CNNNetworkImpl* netImpl = dynamic_cast<InferenceEngine::details::CNNNetworkImpl*>(&network);
+            if (netImpl == nullptr) {
+                THROW_IE_EXCEPTION << "unexpected network type";
+            }
+            MoveInternalBlobsToConstLayers(netImpl);
             MKLDNNGraph::CreateGraph(network, extensionManager, cache);
         }
     }
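The MoveInternalBlobsToConstLayers helper above re-expresses ScaleShift/PReLU internal weights and biases blobs as explicit Const-layer inputs before graph creation. The shape rule it applies is the part worth spelling out; a standalone sketch of just that rule (the function name here is illustrative, not part of the test class):

    #include <cstddef>
    #include <vector>

    // A blob of size C becomes a Const input broadcast along the channel
    // axis: {1, C, 1, ...} padded with ones to the consumer's input rank
    // when the rank is >= 2, and simply {C} for rank-1 inputs.
    std::vector<size_t> constInputDims(size_t blobSize, size_t inputRank) {
        std::vector<size_t> dims(inputRank, 1);
        if (dims.size() > 1)
            dims[1] = blobSize;   // channel axis for NC... layouts
        else
            dims[0] = blobSize;
        return dims;
    }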
diff --git a/inference-engine/thirdparty/mkl-dnn b/inference-engine/thirdparty/mkl-dnn
index 4b23902..d7d8ed4 160000
--- a/inference-engine/thirdparty/mkl-dnn
+++ b/inference-engine/thirdparty/mkl-dnn
@@ -1 +1 @@
-Subproject commit 4b239023043318899e1c0a3b79158a68b7efe6e4
+Subproject commit d7d8ed46078b637794bc91215e1a982bb0f1683a
diff --git a/ngraph/python/tests/__init__.py b/ngraph/python/tests/__init__.py
index 34e3ecb..56ac242 100644
--- a/ngraph/python/tests/__init__.py
+++ b/ngraph/python/tests/__init__.py
@@ -115,11 +115,6 @@ xfail_issue_38084 = xfail_test(reason="RuntimeError: AssertionFailed: layer->get
 xfail_issue_38085 = xfail_test(reason="RuntimeError: Interpolate operation should be converted to Interp")
 xfail_issue_38086 = xfail_test(reason="RuntimeError: Quantize layer input '' doesn't have blobs")
 xfail_issue_38087 = xfail_test(reason="RuntimeError: Cannot cast to tensor desc. Format is unsupported!")
-xfail_issue_38088 = xfail_test(reason="RuntimeError: Check '((axis >= axis_range_min) && "
-                                      "(axis <= axis_range_max))' failed at "
-                                      "/openvino/ngraph/core/src/validation_util.cpp:913: "
-                                      "Split Parameter axis out of the tensor rank range .")
-xfail_issue_38089 = xfail_test(reason="RuntimeError: Node 2 contains empty child edge for index 0")
 xfail_issue_38090 = xfail_test(reason="AssertionError: Items types are not equal")
 xfail_issue_38091 = xfail_test(reason="AssertionError: Mismatched elements")
 xfail_issue_38699 = xfail_test(reason="RuntimeError: nGraph does not support the following ONNX operations:"
diff --git a/ngraph/python/tests/test_ngraph/test_ops_fused.py b/ngraph/python/tests/test_ngraph/test_ops_fused.py
index e82c678..3fb616c 100644
--- a/ngraph/python/tests/test_ngraph/test_ops_fused.py
+++ b/ngraph/python/tests/test_ngraph/test_ops_fused.py
@@ -22,7 +22,6 @@ from tests import (xfail_issue_34323,
                    skip_segfault,
                    xfail_issue_34327,
                    xfail_issue_36485,
-                   xfail_issue_35923,
                    xfail_issue_36486,
                    xfail_issue_34314,
                    xfail_issue_36487)
@@ -418,7 +417,6 @@ def test_grn_operator():
     assert np.allclose(result, expected)
 
 
-@xfail_issue_35923
 def test_prelu_operator():
     runtime = get_runtime()
 
diff --git a/ngraph/python/tests/test_onnx/test_backend.py b/ngraph/python/tests/test_onnx/test_backend.py
index 6da6151..a72bca3 100644
--- a/ngraph/python/tests/test_onnx/test_backend.py
+++ b/ngraph/python/tests/test_onnx/test_backend.py
@@ -38,7 +38,6 @@ from tests import (BACKEND_NAME,
                    xfail_issue_33616,
                    xfail_issue_38086,
                    xfail_issue_38087,
-                   xfail_issue_35923,
                    xfail_issue_36483,
                    xfail_issue_34323,
                    xfail_issue_35915,
@@ -46,8 +45,6 @@ from tests import (BACKEND_NAME,
                    xfail_issue_36476,
                    xfail_issue_36478,
                    xfail_issue_36437,
-                   xfail_issue_38088,
-                   xfail_issue_38089,
                    xfail_issue_38090,
                    xfail_issue_38091,
                    xfail_issue_35929,
@@ -220,9 +217,6 @@ tests_expected_to_fail = [
      "OnnxBackendNodeModelTest.test_quantizelinear_cpu"),
     (xfail_issue_38087,
      "OnnxBackendNodeModelTest.test_convtranspose_1d_cpu"),
-    (xfail_issue_35923,
-     "OnnxBackendNodeModelTest.test_prelu_broadcast_cpu",
-     "OnnxBackendNodeModelTest.test_prelu_example_cpu"),
     (xfail_issue_36483,
      "OnnxBackendNodeModelTest.test_ceil_cpu",
      "OnnxBackendNodeModelTest.test_ceil_example_cpu"),
@@ -286,10 +280,6 @@ tests_expected_to_fail = [
      "OnnxBackendNodeModelTest.test_argmin_keepdims_example_select_last_index_cpu",
      "OnnxBackendNodeModelTest.test_argmin_keepdims_random_select_last_index_cpu",
      "OnnxBackendNodeModelTest.test_pow_types_float32_uint32_cpu"),
-    (xfail_issue_38088,
-     "OnnxBackendPyTorchConvertedModelTest.test_GLU_cpu"),
-    (xfail_issue_38089,
-     "OnnxBackendPyTorchConvertedModelTest.test_GLU_dim_cpu"),
     (xfail_issue_38090,
      "OnnxBackendNodeModelTest.test_where_long_example_cpu",
      "OnnxBackendNodeModelTest.test_mod_int64_fmod_cpu",
diff --git a/ngraph/python/tests/test_onnx/test_ops_logical.py b/ngraph/python/tests/test_onnx/test_ops_logical.py
index 246b52f..bbd6857 100644
--- a/ngraph/python/tests/test_onnx/test_ops_logical.py
+++ b/ngraph/python/tests/test_onnx/test_ops_logical.py
@@ -18,7 +18,6 @@ import onnx
 import pytest
 
 from tests.test_onnx.utils import run_node
-from tests import xfail_issue_35915
 
 
 @pytest.mark.parametrize(
@@ -27,9 +26,9 @@ from tests import xfail_issue_35915
         pytest.param("And", np.logical_and, np.bool),
         pytest.param("Or", np.logical_or, np.bool),
         pytest.param("Xor", np.logical_xor, np.bool),
-        pytest.param("Equal", np.equal, np.int32, marks=xfail_issue_35915),
-        pytest.param("Greater", np.greater, np.int32, marks=xfail_issue_35915),
-        pytest.param("Less", np.less, np.int32, marks=xfail_issue_35915),
+        pytest.param("Equal", np.equal, np.int32),
+        pytest.param("Greater", np.greater, np.int32),
+        pytest.param("Less", np.less, np.int32),
     ],
 )
 def test_logical(onnx_op, numpy_func, data_type):
diff --git a/ngraph/python/tests/test_onnx/test_ops_nonlinear.py b/ngraph/python/tests/test_onnx/test_ops_nonlinear.py
index 7bb55e0..d1c8a2a 100644
--- a/ngraph/python/tests/test_onnx/test_ops_nonlinear.py
+++ b/ngraph/python/tests/test_onnx/test_ops_nonlinear.py
@@ -18,7 +18,7 @@ import onnx
 import pytest
 
 from tests.test_onnx.utils import run_node
-from tests import xfail_issue_35918, xfail_issue_35923, xfail_issue_35924
+from tests import xfail_issue_35918, xfail_issue_35924
 
 
 def import_and_compute(op_type, input_data, **node_attrs):
@@ -71,7 +71,6 @@ def test_leaky_relu():
     assert_onnx_import_equals_callable("LeakyRelu", leaky_relu, [[-3, -2, -1], [1, 2, 3]])
 
 
-@xfail_issue_35923
 @pytest.mark.parametrize(
     "x, slope",
     [
-- 
2.7.4