endif()
set(LAYERS
- ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_activation_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_batchnorm_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_bin_conv_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_concat_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_crop_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_deconv_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_def_conv_node.cpp
- ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_depthwise_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_eltwise_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_fullyconnected_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_gemm_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_memory_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_permute_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_pooling_node.cpp
- ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_power_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_quantize_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_reorder_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_reshape_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/unique.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/unsqueeze.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/common/softmax.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/nodes/common/emitter.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/interp.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/nodes/jit_eltwise_emitters.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/nodes/jit_mkldnn_emitters.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/argmax.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/argmax_imp.cpp
return typeDesc->getPtr();
}
-MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::depthwise_forward::desc> desc) {
- this->desc.reset(new DescFwdImpl<mkldnn::depthwise_forward::desc>(desc));
-}
-
-MKLDNNDescriptor::operator std::shared_ptr<mkldnn::depthwise_forward::desc>() {
- DescFwdImpl<mkldnn::depthwise_forward::desc> *typeDesc =
- dynamic_cast<DescFwdImpl<mkldnn::depthwise_forward::desc> *>(desc.get());
- if (typeDesc == nullptr) {
- THROW_IE_EXCEPTION << "Cannot cast descriptor!";
- }
- return typeDesc->getPtr();
-}
-
MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::rnn_forward::desc> desc) {
this->desc.reset(new DescFwdImpl<mkldnn::rnn_forward::desc>(desc));
}
explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::softmax_forward::desc> desc);
operator std::shared_ptr<mkldnn::softmax_forward::desc>();
- explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::depthwise_forward::desc> desc);
- operator std::shared_ptr<mkldnn::depthwise_forward::desc>();
-
explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::rnn_forward::desc> desc);
operator std::shared_ptr<mkldnn::rnn_forward::desc>();
#include <unordered_set>
#include <utility>
#include <cstring>
+#include <legacy/details/ie_cnn_network_tools.h>
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
if (_cfg.lpTransformsMode == Config::LPTransformsMode::On) {
#ifdef USE_CNNNETWORK_LPT
auto params = LayerTransformation::Params(true, // updatePrecisions
- true, // quantizeOutputs
- true, // weightsToConst
- LayerTransformation::QuantizedTensorAlignment::UpdateLevel, // quantizedTensorAlignmentOnActivations
- LayerTransformation::QuantizedTensorAlignment::None, // quantizedTensorAlignmentOnWeights
- true, // roundQuantizedValues
- true, // updateBiases
- true); // supportAsymmetricQuantization
+ true, // quantizeOutputs
+ true, // weightsToConst
+ LayerTransformation::QuantizedTensorAlignment::UpdateLevel, // quantizedTensorAlignmentOnActivations
+ LayerTransformation::QuantizedTensorAlignment::None, // quantizedTensorAlignmentOnWeights
+ true, // roundQuantizedValues
+ true, // updateBiases
+ true); // supportAsymmetricQuantization
LowPrecisionTransformer transformer(LowPrecisionTransformer::getAllTransformations(params).
add<ConvolutionTransformation>(LayerTransformation::Params(params).setPrecisionsOnActivations({ Precision::U8 }), "Convolution").
- addCleanup<ScaleShiftToConvolutionTransformation>(
- LayerTransformation::Params(params).setPrecisionsOnActivations({ Precision::U8 }),
- "ScaleShift"));
+ remove("ScaleShift").
+ remove("Power"));
transformer.transform(*_clonedNetwork);
#endif
MKLDNNGraph::ApplyUnrollPasses(static_cast<ICNNNetwork&>(*_clonedNetwork));
+ // Wraps `blob` into a freshly created "Const" layer and wires that layer's output in as one
+ // more input of `layer`. The new layer (and its output data) is named
+ // "<layer name>_const_<name>" and is registered in _clonedNetwork.
+ auto createConstInputTo = [&](CNNLayerPtr layer, Blob::Ptr blob, std::string name) {
+     const auto& blobPrecision = blob->getTensorDesc().getPrecision();
+
+     LayerParams constParams = {layer->name + "_const_" + name, "Const", blobPrecision};
+     auto constLayer = std::make_shared<InferenceEngine::CNNLayer>(constParams);
+     constLayer->blobs["custom"] = blob;
+
+     // The constant gets the rank of the layer's first input: every axis is 1 except the one
+     // that receives the blob's element count (axis 1 when rank > 1, axis 0 otherwise).
+     const size_t inputRank = layer->insData[0].lock()->getDims().size();
+     std::vector<size_t> dims(inputRank, 1);
+     dims[dims.size() > 1 ? 1 : 0] = blob->size();
+     const TensorDesc constDesc{blobPrecision, dims, TensorDesc::getLayoutByDims(dims)};
+
+     // Output data of the Const layer; it starts with no consumers.
+     DataPtr constData = std::make_shared<Data>(constLayer->name, constDesc);
+     constData->setName(constLayer->name);
+     getCreatorLayer(constData) = constLayer;
+     getInputTo(constData).clear();
+
+     // Register the data first, then the layer, in the cloned network.
+     _clonedNetwork->addData(constLayer->name.c_str(), constData);
+     IE_SUPPRESS_DEPRECATED_START
+     _clonedNetwork->addLayer(constLayer);
+     IE_SUPPRESS_DEPRECATED_END
+
+     // Connect: Const layer -> constData -> layer (appended after the existing inputs).
+     constLayer->outData.push_back(constData);
+     getInputTo(constData)[layer->name] = layer;
+     layer->insData.push_back(constData);
+ };
+
+ // Walk the cloned network in topological order and turn the "weights"/"biases" blobs of
+ // single-input ScaleShift and PReLU layers into explicit Const inputs.
+ auto all_layers = details::CNNNetSortTopologically(*_clonedNetwork);
+ for (auto &cnnLayer : all_layers) {
+     // Only layers with exactly one data input are touched.
+     if (cnnLayer->insData.size() != 1)
+         continue;
+
+     if (cnnLayer->type == "ScaleShift") {
+         Blob::Ptr scales = cnnLayer->blobs["weights"];
+         const bool hasScales = (scales != nullptr);
+         if (hasScales)
+             createConstInputTo(cnnLayer, scales, "weights");
+
+         Blob::Ptr shifts = cnnLayer->blobs["biases"];
+         if (shifts == nullptr && hasScales) {
+             // Scales without shifts: synthesize an all-zero float blob described by the
+             // scales' tensor descriptor so the layer still receives a biases input.
+             shifts = make_shared_blob<float>(scales->getTensorDesc());
+             shifts->allocate();
+             float *zeroed = shifts->buffer().as<float*>();
+             for (size_t idx = 0; idx < shifts->size(); ++idx)
+                 zeroed[idx] = 0;
+         }
+
+         if (shifts != nullptr)
+             createConstInputTo(cnnLayer, shifts, "biases");
+     } else if (cnnLayer->type == "PReLU") {
+         Blob::Ptr slopes = cnnLayer->blobs["weights"];
+         if (slopes != nullptr)
+             createConstInputTo(cnnLayer, slopes, "weights");
+     }
+ }
+
+
if (_cfg.batchLimit > 1) {
// check topology for applicability
if (!CanProcessDynBatch(*_clonedNetwork)) {
type != SoftMax &&
type != Split &&
type != Concatenation &&
- type != Power &&
type != Eltwise &&
type != Crop &&
type != BatchNormalization &&
#include "mkldnn_extension_utils.h"
#include "nodes/mkldnn_reshape_node.h"
-#include "nodes/mkldnn_activation_node.h"
#include "nodes/mkldnn_pooling_node.h"
#include "nodes/mkldnn_eltwise_node.h"
-#include "nodes/mkldnn_depthwise_node.h"
#include "nodes/mkldnn_concat_node.h"
#include "nodes/mkldnn_reorder_node.h"
#include "nodes/mkldnn_conv_node.h"
#include "nodes/mkldnn_mvn_node.h"
#include "nodes/mkldnn_resample_node.h"
#include "nodes/mkldnn_interpolate_node.h"
+#include "nodes/mkldnn_input_node.h"
#include <blob_factory.hpp>
#include <legacy/ie_layers_internal.hpp>
MergeTwoEqualScaleShifts(graph);
graph.RemoveDroppedNodes();
- MergeSigmoidAndMultiplyToSwish(graph);
- graph.RemoveDroppedNodes();
-
MergeConversions(graph);
graph.RemoveDroppedNodes();
FuseConvolutionAndZeroPoints(graph);
graph.RemoveDroppedNodes();
-#if defined (COMPILED_CPU_MKLDNN_DEPTHWISE_NODE)
FuseConvolutionAndDepthwise(graph);
graph.RemoveDroppedNodes();
-#endif
-#if defined(COMPILED_CPU_MKLDNN_ACTIVATION_NODE)
FuseConvolutionAndActivation(graph);
graph.RemoveDroppedNodes();
-#endif
-#if defined (COMPILED_CPU_MKLDNN_DEPTHWISE_NODE)
FuseConvolutionAndDepthwise(graph);
graph.RemoveDroppedNodes();
-#endif
FuseConvolutionAndQuantize(graph);
graph.RemoveDroppedNodes();
graph.SortTopologically();
graph.RemoveDroppedEdges();
-#if defined (COMPILED_CPU_MKLDNN_DEPTHWISE_NODE)
FuseConvolutionAndDepthwise(graph);
graph.RemoveDroppedNodes();
-#endif
FusePoolingAndQuantize(graph);
graph.RemoveDroppedNodes();
}
void MKLDNNGraphOptimizer::FuseConvolutionAndZeroPoints(MKLDNNGraph &graph) {
- auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) {
- auto& edges = graph.GetEdges();
- for (auto it = edges.begin(); it != edges.end(); it++) {
- if ((*it) == edge) {
- edges.erase(it);
- return;
- }
- }
- };
-
auto& graphNodes = graph.GetNodes();
auto isSutableConvNode = [](MKLDNNNodePtr node) {
int OC = node->getChildEdgesAtPort(0)[0]->getDims()[1];
if (parent0->getType() == Eltwise) {
- auto * eltwiseLayer = dynamic_cast<EltwiseLayer*>(parent0->getCnnLayer().get());
- if (eltwiseLayer == nullptr)
- THROW_IE_EXCEPTION << "Cannot get eltwise layer " << node->getName();
-
- if (eltwiseLayer->_operation != EltwiseLayer::Sub)
+ auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(parent0.get());
+ // The cast result is dereferenced right below, so fail loudly if the node reports the
+ // Eltwise type but is not backed by an MKLDNNEltwiseNode.
+ if (eltwiseNode == nullptr)
+     THROW_IE_EXCEPTION << "Cannot get eltwise node " << parent0->getName();
+
+ // Only a Subtract eltwise is accepted here; any other operation rejects the candidate.
+ if (eltwiseNode->getOpType() != Subtract)
return false;
if (parent0->getParentEdges().size() != 2)
int OC = node->getChildEdgesAtPort(0)[0]->getDims()[1];
if (parent0->getType() == Eltwise) {
- auto * eltwiseLayer = dynamic_cast<EltwiseLayer*>(parent0->getCnnLayer().get());
- if (eltwiseLayer == nullptr)
- THROW_IE_EXCEPTION << "Cannot get eltwise layer " << node->getName();
-
- if (eltwiseLayer->_operation != EltwiseLayer::Sub)
+ auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(parent0.get());
+ // The cast result is dereferenced right below, so fail loudly if the node reports the
+ // Eltwise type but is not backed by an MKLDNNEltwiseNode.
+ if (eltwiseNode == nullptr)
+     THROW_IE_EXCEPTION << "Cannot get eltwise node " << parent0->getName();
+
+ // Only a Subtract eltwise is accepted here; any other operation rejects the candidate.
+ if (eltwiseNode->getOpType() != Subtract)
return false;
if (parent0->getParentEdges().size() != 2)
auto& graphNodes = graph.GetNodes();
auto isSutableScaleShiftNode = [](MKLDNNNodePtr node) {
- if (node->getType() != Depthwise)
+ if (node->getType() != Eltwise)
return false;
- auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode *>(node.get());
- if (depthwiseNode == nullptr)
- THROW_IE_EXCEPTION << "Cannot cast " << node->getName() << " to Depthwise node";
+ auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
+ if (eltwiseNode == nullptr)
+ THROW_IE_EXCEPTION << "Cannot cast " << node->getName() << " to Eltwise node";
- if (depthwiseNode->getChildEdges().size() != 1)
+ if (eltwiseNode->getChildEdges().size() != 1)
return false;
- if (depthwiseNode->getAlgorithm() != depthwise_scale_shift || depthwiseNode->isBroadcast())
+ if (eltwiseNode->getOpType() != MulAdd)
return false;
return true;
if (node1->getParentEdgeAt(0) != node2->getParentEdgeAt(0))
return false;
- auto *depthwiseNode1 = dynamic_cast<MKLDNNDepthwiseNode *>(node1.get());
- auto *depthwiseNode2 = dynamic_cast<MKLDNNDepthwiseNode *>(node2.get());
+ auto *eltwiseNode1 = dynamic_cast<MKLDNNEltwiseNode *>(node1.get());
+ auto *eltwiseNode2 = dynamic_cast<MKLDNNEltwiseNode *>(node2.get());
- auto depthwiseLayer1 = depthwiseNode1->getCnnLayer();
- auto depthwiseLayer2 = depthwiseNode2->getCnnLayer();
+ auto eltwiseLayer1 = eltwiseNode1->getCnnLayer();
+ auto eltwiseLayer2 = eltwiseNode2->getCnnLayer();
- Blob::Ptr scalesBlob1 = depthwiseLayer1->blobs["weights"];
- Blob::Ptr shiftsBlob1 = depthwiseLayer1->blobs["biases"];
- Blob::Ptr scalesBlob2 = depthwiseLayer2->blobs["weights"];
- Blob::Ptr shiftsBlob2 = depthwiseLayer2->blobs["biases"];
+ Blob::Ptr scalesBlob1 = eltwiseLayer1->blobs["weights"];
+ Blob::Ptr shiftsBlob1 = eltwiseLayer1->blobs["biases"];
+ Blob::Ptr scalesBlob2 = eltwiseLayer2->blobs["weights"];
+ Blob::Ptr shiftsBlob2 = eltwiseLayer2->blobs["biases"];
if (scalesBlob1 == nullptr || shiftsBlob1 == nullptr || scalesBlob2 == nullptr || shiftsBlob2 == nullptr)
return false;
auto MergeScaleShiftNodes = [&](MKLDNNNodePtr childNode1, MKLDNNNodePtr childNode2) {
auto parentNode = childNode2->getParentEdgeAt(0)->getParent();
auto ccNode2 = childNode2->getChildEdgeAt(0)->getChild();
+
+ auto parentEdges = childNode2->parentEdges;
+ for (auto &parentEdge : parentEdges) {
+ auto p_edge = parentEdge.lock();
+ if (p_edge->getParent() == parentNode)
+ continue;
+
+ removeEdge(graph, p_edge);
+ }
+
graph.DropNode(childNode2);
MKLDNNEdgePtr remEdge;
}
}
-void MKLDNNGraphOptimizer::MergeSigmoidAndMultiplyToSwish(MKLDNNGraph& graph) {
- auto& graphNodes = graph.GetNodes();
- std::vector<MKLDNNNodePtr> newNodes;
-
- MKLDNNNodePtr parentNode;
- MKLDNNNodePtr activationNode, eltwiseNode;
- MKLDNNEdgePtr remEdge;
-
- auto areSutableChildNodes = [&]() {
- auto childNode1 = parentNode->getChildEdgeAt(0)->getChild();
- auto childNode2 = parentNode->getChildEdgeAt(1)->getChild();
-
- if (childNode1->getType() == Activation && childNode2->getType() == Eltwise) {
- activationNode = childNode1;
- eltwiseNode = childNode2;
- remEdge = parentNode->getChildEdgeAt(1);
- } else if (childNode1->getType() == Eltwise && childNode2->getType() == Activation) {
- activationNode = childNode2;
- eltwiseNode = childNode1;
- remEdge = parentNode->getChildEdgeAt(0);
- } else {
- return false;
- }
-
- if (activationNode->getParentEdges().size() != 1 || activationNode->getChildEdges().size() != 1)
- return false;
-
- if (eltwiseNode->getParentEdges().size() != 2)
- return false;
-
- if (activationNode->getChildEdgeAt(0)->getChild() != eltwiseNode)
- return false;
-
- auto *activationNodePtr = dynamic_cast<MKLDNNActivationNode *>(activationNode.get());
- if (activationNodePtr == nullptr)
- THROW_IE_EXCEPTION << "Cannot cast " << activationNode->getName() << " to Activation node";
- if (activationNodePtr->getAlgorithm() != eltwise_logistic)
- return false;
-
- auto *eltwiseNodePtr = dynamic_cast<MKLDNNEltwiseNode *>(eltwiseNode.get());
- if (eltwiseNodePtr == nullptr)
- THROW_IE_EXCEPTION << "Cannot cast " << eltwiseNode->getName() << " to Eltwise node";
- auto *eltwiseLayer = dynamic_cast<EltwiseLayer*>(eltwiseNode->getCnnLayer().get());
- if (eltwiseLayer == nullptr)
- THROW_IE_EXCEPTION << "Cannot get eltwise layer " << eltwiseNode->getName();
- if (eltwiseLayer->_operation != EltwiseLayer::Prod)
- return false;
-
- return true;
- };
-
- auto MergeToSwish = [&]() {
- // 1. Remove edge Parent-Eltwise
- remEdge->drop();
- graph.GetEdges().erase(std::remove(graph.GetEdges().begin(), graph.GetEdges().end(), remEdge), graph.GetEdges().end());
-
- // 2. Remove Sigmoid node and edges Parent-Sigmoid and Sigmoid-Eltwise
- graph.DropNode(activationNode);
- remEdge = parentNode->getChildEdgeAt(0);
- auto oIndex = remEdge->getOutputNum();
- auto iIndex = remEdge->getInputNum();
- remEdge->drop();
- graph.GetEdges().erase(std::remove(graph.GetEdges().begin(), graph.GetEdges().end(), remEdge), graph.GetEdges().end());
-
- // 3. Create Swish node
- CNNLayerPtr swishLayer(new CNNLayer(*activationNode->getCnnLayer().get()));
- swishLayer->name = activationNode->getName() + "_Swish";
- swishLayer->type = "Swish";
- MKLDNNNodePtr swishNode(new MKLDNNActivationNode(swishLayer, graph.getEngine(), graph.weightsCache));
-
- // 4. Create edges Parent-Swish and Swish-Eltwise, connect to Swish node, add edges to graph
- MKLDNNEdgePtr beforeSwishEdge(new MKLDNNEdge(parentNode, swishNode, iIndex, 0));
- MKLDNNEdgePtr afterSwishEdge(new MKLDNNEdge(swishNode, eltwiseNode, 0, oIndex));
- swishNode->addEdge(beforeSwishEdge);
- swishNode->addEdge(afterSwishEdge);
- graph.GetEdges().push_back(beforeSwishEdge);
- graph.GetEdges().push_back(afterSwishEdge);
- newNodes.push_back(swishNode);
-
- // 5. Remove Eltwise node
- graph.DropNode(eltwiseNode);
- };
-
- for (int i = 0; i < graphNodes.size(); i++) {
- parentNode = graphNodes[i];
- if (parentNode->getChildEdges().size() != 2)
- continue;
-
- if (!areSutableChildNodes()) continue;
-
- MergeToSwish();
- }
- for (int i = 0; i < newNodes.size(); i++) {
- graph.GetNodes().push_back(newNodes[i]);
- }
-}
-
void MKLDNNGraphOptimizer::FuseBatchNormWithScale(MKLDNNGraph &graph) {
auto &graphNodes = graph.GetNodes();
return x->getName() == node_name;}) == outputNodes.end()) {
if (bn->getChildEdges().size() == 1) {
auto child = bn->getChildEdgeAt(0)->getChild();
- if (child->type == Depthwise && child->getCnnLayer()->type == "ScaleShift") {
+ if (child->type == Eltwise && child->getCnnLayer()->type == "ScaleShift") {
bn->fuseWith(child);
+
+ auto parentEdges = child->parentEdges;
+ for (auto &parentEdge : parentEdges) {
+ auto p_edge = parentEdge.lock();
+ if (p_edge->getParent()->getType() == BatchNormalization)
+ continue;
+
+ removeEdge(graph, p_edge);
+ }
+
graph.DropNode(child);
}
}
}
}
-#if defined(COMPILED_CPU_MKLDNN_ACTIVATION_NODE)
void MKLDNNGraphOptimizer::FuseConvolutionAndActivation(MKLDNNGraph &graph) {
- auto isOneOf = [&](mkldnn::algorithm alg, std::vector<mkldnn::algorithm> algs) {
- for (auto a : algs) {
- if (alg == a) {
- return true;
- }
- }
- return false;
- };
-
auto& graphNodes = graph.GetNodes();
auto isFusingSupported = [&](MKLDNNNodePtr conv, MKLDNNNodePtr activation) {
if (!activation->getCnnLayer())
return false;
- auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(activation.get());
+ auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(activation.get());
- return activationNode &&
- (activationNode->getAlgorithm() == eltwise_relu ||
+ return eltwiseNode &&
+ (eltwiseNode->getOpType() == Relu ||
(conv->getCnnLayer()->precision == Precision::FP32 &&
- isOneOf(activationNode->getAlgorithm(), {eltwise_elu, eltwise_logistic, eltwise_bounded_relu, eltwise_clamp,
- eltwise_swish, eltwise_hswish, eltwise_mish, eltwise_hsigmoid})));
+ IsOneOf(eltwiseNode->getOpType(), {Elu, Logistic, BoundedRelu, Clamp, Swish, Hswish, Mish, Hsigmoid})));
};
for (int i = 0; i < graphNodes.size(); i++) {
}
void MKLDNNGraphOptimizer::FuseFullyConnectedAndSimpleOperation(MKLDNNGraph &graph) {
- auto isOneOf = [&](mkldnn::algorithm alg, std::vector<mkldnn::algorithm> algs) {
- for (auto a : algs) {
- if (alg == a) {
- return true;
- }
- }
- return false;
- };
-
- auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) {
- auto& edges = graph.GetEdges();
- for (auto it = edges.begin(); it != edges.end(); it++) {
- if ((*it) == edge) {
- edges.erase(it);
- return;
- }
- }
- };
-
auto& graphNodes = graph.GetNodes();
auto isSutableParentNode = [](MKLDNNNodePtr node) {
quantizeNode->isOutputLowBroadcast() && quantizeNode->isOutputHighBroadcast() &&
!quantizeNode->isBinarization());
}
- } else if (childNode->getType() == Depthwise) {
- auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode*>(childNode.get());
- if (depthwiseNode == nullptr)
- THROW_IE_EXCEPTION << "Cannot get depthwise layer " << childNode->getName();
+ } else if (childNode->getType() == Eltwise) {
+ auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode*>(childNode.get());
+ if (eltwiseNode == nullptr)
+ THROW_IE_EXCEPTION << "Cannot get Eltwise node " << childNode->getName();
- if (parentNode->getParentEdgesAtPort(0)[0]->getDims().ndims() != 3) {
- return ((depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_scale_shift &&
- depthwiseNode->isWithBiases()) ||
- (depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_prelu));
- } else {
- const auto &depthwiseLayer = depthwiseNode->getCnnLayer();
- if (depthwiseLayer == nullptr)
- THROW_IE_EXCEPTION << "Cannot get scale shift layer " << depthwiseNode->getName();
-
- if (depthwiseNode->getAlgorithm() != mkldnn::algorithm::depthwise_scale_shift)
- return false;
-
- Blob::Ptr scalesBlob = depthwiseLayer->blobs["weights"];
- if (scalesBlob == nullptr)
+ if (IsOneOf(eltwiseNode->getOpType(), {Relu, Gelu, Elu, Logistic, BoundedRelu, Clamp, Swish, Hswish, Mish, Hsigmoid})) {
+ return true;
+ } else if (IsOneOf(eltwiseNode->getOpType(), {MulAdd, Prelu})) {
+ if (eltwiseNode->getOpType() == MulAdd && eltwiseNode->getCnnLayer()->blobs.size() != 2)
return false;
- Blob::Ptr shiftsBlob = depthwiseLayer->blobs["biases"];
- if (shiftsBlob == nullptr)
- return false;
+ if (parentNode->getParentEdgesAtPort(0)[0]->getDims().ndims() != 3) {
+ return true;
+ } else {
+ const auto &eltwiseLayer = eltwiseNode->getCnnLayer();
+ if (eltwiseLayer == nullptr)
+ THROW_IE_EXCEPTION << "Cannot get scale shift layer " << eltwiseNode->getName();
- const float* scalesBufferPtr = scalesBlob->buffer().as<float*>();
- const float* shiftsBufferPtr = shiftsBlob->buffer().as<float*>();
+ if (eltwiseNode->getOpType() != MulAdd)
+ return false;
- if (scalesBlob->size() != shiftsBlob->size())
- return false;
+ Blob::Ptr scalesBlob = eltwiseLayer->blobs["weights"];
+ if (scalesBlob == nullptr)
+ return false;
- for (int i = 1; i < scalesBlob->size(); i++)
- if (scalesBufferPtr[0] != scalesBufferPtr[i])
+ Blob::Ptr shiftsBlob = eltwiseLayer->blobs["biases"];
+ if (shiftsBlob == nullptr)
return false;
- for (int i = 1; i < shiftsBlob->size(); i++)
- if (shiftsBufferPtr[0] != shiftsBufferPtr[i])
+ const float *scalesBufferPtr = scalesBlob->buffer().as<float *>();
+ const float *shiftsBufferPtr = shiftsBlob->buffer().as<float *>();
+
+ if (scalesBlob->size() != shiftsBlob->size())
return false;
- return true;
- }
- } else if (childNode->getType() == Activation) {
- auto* activationNode = dynamic_cast<MKLDNNActivationNode*>(childNode.get());
- if (activationNode == nullptr)
- THROW_IE_EXCEPTION << "Cannot get activation layer " << childNode->getName();
+ for (int i = 1; i < scalesBlob->size(); i++)
+ if (scalesBufferPtr[0] != scalesBufferPtr[i])
+ return false;
- return isOneOf(activationNode->getAlgorithm(), {eltwise_relu, eltwise_gelu, eltwise_elu, eltwise_logistic,
- eltwise_bounded_relu, eltwise_clamp, eltwise_swish, eltwise_hswish,
- eltwise_mish, eltwise_hsigmoid});
+ for (int i = 1; i < shiftsBlob->size(); i++)
+ if (shiftsBufferPtr[0] != shiftsBufferPtr[i])
+ return false;
+
+ return true;
+ }
+ }
}
return false;
parentNode->fuseWith(childNode);
- if (childNode->getType() == Quantize) {
+ if (childNode->getType() == Quantize || childNode->getType() == Eltwise) {
auto parentEdges = childNode->parentEdges;
for (auto &parentEdge : parentEdges) {
auto p_edge = parentEdge.lock();
graph.DropNode(childNode);
}
}
-#endif
-#if defined (COMPILED_CPU_MKLDNN_DEPTHWISE_NODE)
void MKLDNNGraphOptimizer::FuseConvolutionAndDepthwise(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();
};
auto isSutableChildNode = [](MKLDNNNodePtr node) {
- if (node->getType() != Depthwise)
+ if (node->getType() != Eltwise)
return false;
if (!node->getCnnLayer())
return false;
- auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode *>(node.get());
- if (depthwiseNode == nullptr)
- THROW_IE_EXCEPTION << "Cannot get depthwise node " << node->getName();
- return ((depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_scale_shift && depthwiseNode->isWithBiases()) ||
- (depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_prelu));
+ auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
+ if (eltwiseNode == nullptr)
+ THROW_IE_EXCEPTION << "Cannot get eltwise node " << node->getName();
+ return ((eltwiseNode->getOpType() == MulAdd && node->getCnnLayer()->blobs.size() == 2) ||
+ (eltwiseNode->getOpType() == Prelu));
};
for (int i = 0; i < graphNodes.size(); i++) {
if (isSutableChildNode(depthwise1)) {
conv->fuseWith(depthwise1);
+
+ auto parents = depthwise1->parentEdges;
+ for (size_t j = 0; j < parents.size(); j++) {
+ auto p_edge = parents[j].lock();
+ if (p_edge->getParent()->getType() == Eltwise)
+ continue;
+
+ removeEdge(graph, p_edge);
+ }
+
graph.DropNode(depthwise1);
}
}
+ auto parents = depthwise0->parentEdges;
+ for (size_t j = 0; j < parents.size(); j++) {
+ auto p_edge = parents[j].lock();
+ if (p_edge->getParent()->getType() == Convolution || p_edge->getParent()->getType() == BinaryConvolution)
+ continue;
+
+ removeEdge(graph, p_edge);
+ }
+
graph.DropNode(depthwise0);
}
}
-#endif
void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();
#if defined(COMPILED_CPU_MKLDNN_QUANTIZE_NODE)
void MKLDNNGraphOptimizer::FuseConvolutionAndQuantize(MKLDNNGraph &graph) {
- auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) {
- auto& edges = graph.GetEdges();
- for (auto it = edges.begin(); it != edges.end(); it++) {
- if ((*it) == edge) {
- edges.erase(it);
- return;
- }
- }
- };
-
auto& graphNodes = graph.GetNodes();
auto isSutableParentNode = [](MKLDNNNodePtr node) {
}
void MKLDNNGraphOptimizer::FuseConvolutionAndSimpleOperation(MKLDNNGraph &graph) {
- auto isOneOf = [&](mkldnn::algorithm alg, std::vector<mkldnn::algorithm> algs) {
- for (auto a : algs) {
- if (alg == a) {
- return true;
- }
- }
- return false;
- };
-
- auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) {
- auto& edges = graph.GetEdges();
- for (auto it = edges.begin(); it != edges.end(); it++) {
- if ((*it) == edge) {
- edges.erase(it);
- return;
- }
- }
- };
-
auto& graphNodes = graph.GetNodes();
auto isSutableParentNode = [](MKLDNNNodePtr node) {
THROW_IE_EXCEPTION << "Cannot get quantize layer " << node->getName();
return !quantizeNode->isBinarization();
- } else if (node->getType() == Depthwise) {
- auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode*>(node.get());
- if (depthwiseNode == nullptr)
- THROW_IE_EXCEPTION << "Cannot get depthwise layer " << node->getName();
-
- return ((depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_scale_shift && depthwiseNode->isWithBiases()) ||
- (depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_prelu));
- } else if (node->getType() == Activation) {
- auto* activationNode = dynamic_cast<MKLDNNActivationNode*>(node.get());
- if (activationNode == nullptr)
- THROW_IE_EXCEPTION << "Cannot get activation layer " << node->getName();
+ } else if (node->getType() == Eltwise) {
+ auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
+ if (eltwiseNode == nullptr)
+ THROW_IE_EXCEPTION << "Cannot get eltwise node " << node->getName();
- return isOneOf(activationNode->getAlgorithm(), {eltwise_relu, eltwise_elu, eltwise_logistic, eltwise_bounded_relu,
- eltwise_clamp, eltwise_swish, eltwise_hswish, eltwise_mish,
- eltwise_hsigmoid});
+ return ((eltwiseNode->getOpType() == MulAdd && node->getCnnLayer()->blobs.size() == 2) ||
+ (eltwiseNode->getOpType() == Prelu) ||
+ IsOneOf(eltwiseNode->getOpType(), {Relu, Elu, Logistic, BoundedRelu, Clamp, Swish, Hswish, Mish, Hsigmoid}));
}
return false;
parentNode->fuseWith(childNode);
- if (childNode->getType() == Quantize) {
+ if (childNode->getType() == Quantize || childNode->getType() == Eltwise) {
auto parentEdges = childNode->parentEdges;
for (auto &parentEdge : parentEdges) {
auto p_edge = parentEdge.lock();
}
void MKLDNNGraphOptimizer::FuseBinaryConvolutionAndQuantize(MKLDNNGraph &graph) {
- auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) {
- auto& edges = graph.GetEdges();
- for (auto it = edges.begin(); it != edges.end(); it++) {
- if ((*it) == edge) {
- edges.erase(it);
- return;
- }
- }
- };
-
auto& graphNodes = graph.GetNodes();
auto isSutableParentNode = [](MKLDNNNodePtr node) {
}
void MKLDNNGraphOptimizer::FusePoolingAndQuantize(MKLDNNGraph &graph) {
- auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) {
- auto& edges = graph.GetEdges();
- for (auto it = edges.begin(); it != edges.end(); it++) {
- if ((*it) == edge) {
- edges.erase(it);
- return;
- }
- }
- };
-
auto& graphNodes = graph.GetNodes();
auto isSutableParentNode = [](MKLDNNNodePtr node) {
void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNGraph &graph) {
std::vector<MKLDNNNodePtr> &graphNodes = graph.GetNodes();
- auto isOneOf = [&](mkldnn::algorithm alg, std::vector<mkldnn::algorithm> algs) {
- for (auto a : algs) {
- if (alg == a) {
- return true;
- }
- }
- return false;
- };
-
auto isFusingSupported = [&](MKLDNNNodePtr conv, MKLDNNNodePtr activation) {
if (!activation->getCnnLayer())
return false;
-#if defined(COMPILED_CPU_MKLDNN_ACTIVATION_NODE)
- auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(activation.get());
+ auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(activation.get());
- return activationNode &&
- (activationNode->getAlgorithm() == eltwise_relu ||
+ return eltwiseNode &&
+ (eltwiseNode->getOpType() == Relu ||
(conv->getCnnLayer()->precision == Precision::FP32 &&
- isOneOf(activationNode->getAlgorithm(), {eltwise_elu, eltwise_logistic, eltwise_bounded_relu, eltwise_clamp,
- eltwise_swish, eltwise_hswish, eltwise_mish, eltwise_hsigmoid})));
-#else
- return false;
-#endif
+ IsOneOf(eltwiseNode->getOpType(), {Elu, Logistic, BoundedRelu, Clamp, Swish, Hswish, Mish, Hsigmoid})));
};
for (auto &graphNode : graphNodes) {
continue;
if (!std::dynamic_pointer_cast<MKLDNNEltwiseNode>(graphNode)->isSum()) continue;
- if (!std::dynamic_pointer_cast<MKLDNNEltwiseNode>(graphNode)->isUnitScales()) continue;
if (std::dynamic_pointer_cast<MKLDNNEltwiseNode>(graphNode)->isWithBroadcast()) continue;
// TODO: Enlarge to several inputs
#endif
void MKLDNNGraphOptimizer::FuseMVNAndSimpleOperation(MKLDNNGraph &graph) {
- auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) {
- auto& edges = graph.GetEdges();
- for (auto it = edges.begin(); it != edges.end(); it++) {
- if ((*it) == edge) {
- edges.erase(it);
- return;
- }
- }
- };
-
auto& graphNodes = graph.GetNodes();
auto isSutableParentNode = [](MKLDNNNodePtr node) {
if (quantizeNode == nullptr)
THROW_IE_EXCEPTION << "Cannot get quantize layer " << node->getName();
return !quantizeNode->isBinarization();
- } else if (node->getType() == Depthwise) {
- auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode*>(node.get());
- if (depthwiseNode == nullptr)
- THROW_IE_EXCEPTION << "Cannot get depthwise layer " << node->getName();
- return depthwiseNode->cnnLayer->type == "ScaleShift";
- } else if (node->getType() == Activation) {
- auto* activationNode = dynamic_cast<MKLDNNActivationNode*>(node.get());
- if (activationNode == nullptr)
- THROW_IE_EXCEPTION << "Cannot get activation layer " << node->getName();
- return activationNode->getAlgorithm() == eltwise_relu;
+ } else if (node->getType() == Eltwise) {
+ auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
+ if (eltwiseNode == nullptr)
+ THROW_IE_EXCEPTION << "Cannot get eltwise node " << node->getName();
+
+ return ((eltwiseNode->getOpType() == MulAdd) ||
+ (eltwiseNode->getOpType() == Prelu) ||
+ eltwiseNode->getOpType() == Relu);
}
return false;
parentNode->fuseWith(childNode);
- if (childNode->getType() == Quantize) {
+ if (childNode->getType() == Quantize || childNode->getType() == Eltwise) {
auto parentEdges = childNode->parentEdges;
for (auto &parentEdge : parentEdges) {
auto p_edge = parentEdge.lock();
}
void MKLDNNGraphOptimizer::FuseResampleAndSimpleOperation(MKLDNNGraph &graph) {
- auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) {
- auto& edges = graph.GetEdges();
- for (auto it = edges.begin(); it != edges.end(); it++) {
- if ((*it) == edge) {
- edges.erase(it);
- return;
- }
- }
- };
-
auto& graphNodes = graph.GetNodes();
auto isSutableParentNode = [](MKLDNNNodePtr node) {
if (quantizeNode == nullptr)
THROW_IE_EXCEPTION << "Cannot get quantize layer " << node->getName();
return !quantizeNode->isBinarization();
- } else if (node->getType() == Depthwise) {
- auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode*>(node.get());
- if (depthwiseNode == nullptr)
- THROW_IE_EXCEPTION << "Cannot get depthwise layer " << node->getName();
- return depthwiseNode->cnnLayer->type == "ScaleShift";
- } else if (node->getType() == Activation) {
- auto* activationNode = dynamic_cast<MKLDNNActivationNode*>(node.get());
- if (activationNode == nullptr)
- THROW_IE_EXCEPTION << "Cannot get activation layer " << node->getName();
- return activationNode->getAlgorithm() == eltwise_relu;
+ } else if (node->getType() == Eltwise) {
+ auto *eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
+ if (eltwiseNode == nullptr)
+ THROW_IE_EXCEPTION << "Cannot get Eltwise node " << node->getName();
+ return eltwiseNode->getOpType() == Relu ||
+ eltwiseNode->getOpType() == MulAdd;
}
return false;
parentNode->fuseWith(childNode);
- if (childNode->getType() == Quantize) {
+ if (childNode->getType() == Quantize || childNode->getType() == Eltwise) {
auto parentEdges = childNode->parentEdges;
for (auto &parentEdge : parentEdges) {
auto p_edge = parentEdge.lock();
}
void MKLDNNGraphOptimizer::FuseInterpolateAndSimpleOperation(MKLDNNGraph &graph) {
- auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) {
- auto& edges = graph.GetEdges();
- for (auto it = edges.begin(); it != edges.end(); it++) {
- if ((*it) == edge) {
- edges.erase(it);
- return;
- }
- }
- };
-
auto& graphNodes = graph.GetNodes();
auto isSuitableParentNode = [](MKLDNNNodePtr node) {
parentNode->fuseWith(childNode);
- if (childNode->getType() == Quantize || childNode->getType() == Depthwise || childNode->getType() == Activation) {
+ if (childNode->getType() == Quantize || childNode->getType() == Eltwise) {
auto parentEdges = childNode->parentEdges;
for (auto &parentEdge : parentEdges) {
auto p_edge = parentEdge.lock();
}
void MKLDNNGraphOptimizer::FuseNormalizeAndSimpleOperation(MKLDNNGraph &graph) {
- auto isOneOf = [&](mkldnn::algorithm alg, std::vector<mkldnn::algorithm> algs) {
- for (auto a : algs) {
- if (alg == a) {
- return true;
- }
- }
- return false;
- };
-
- auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) {
- auto& edges = graph.GetEdges();
- for (auto it = edges.begin(); it != edges.end(); it++) {
- if ((*it) == edge) {
- edges.erase(it);
- return;
- }
- }
- };
-
auto& graphNodes = graph.GetNodes();
auto isSutableParentNode = [](MKLDNNNodePtr node) {
if (quantizeNode == nullptr)
THROW_IE_EXCEPTION << "Cannot get quantize layer " << node->getName();
return !quantizeNode->isBinarization();
- } else if (node->getType() == Depthwise) {
- auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode*>(node.get());
- if (depthwiseNode == nullptr)
- THROW_IE_EXCEPTION << "Cannot get depthwise layer " << node->getName();
- return ((depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_scale_shift && depthwiseNode->isWithBiases()) ||
- (depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_prelu));
- } else if (node->getType() == Activation) {
- auto* activationNode = dynamic_cast<MKLDNNActivationNode*>(node.get());
- if (activationNode == nullptr)
- THROW_IE_EXCEPTION << "Cannot get activation layer " << node->getName();
- return isOneOf(activationNode->getAlgorithm(), {eltwise_relu, eltwise_gelu, eltwise_elu, eltwise_logistic,
- eltwise_bounded_relu, eltwise_clamp, eltwise_tanh, eltwise_swish, eltwise_hswish, eltwise_mish,
- eltwise_hsigmoid, eltwise_linear, eltwise_abs, eltwise_square, eltwise_sqrt});
+ } else if (node->getType() == Eltwise) {
+ auto *eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
+ if (eltwiseNode == nullptr)
+ THROW_IE_EXCEPTION << "Cannot get Eltwise node " << node->getName();
+ return IsOneOf(eltwiseNode->getOpType(), {Relu, Gelu, Elu, Logistic, BoundedRelu, Clamp, Tanh, Swish,
+ Hswish, Mish, Hsigmoid, Linear, Abs, Square, Sqrt}) ||
+ ((eltwiseNode->getOpType() == MulAdd && eltwiseNode->getCnnLayer()->blobs.size() == 2) ||
+ (eltwiseNode->getOpType() == Prelu));
}
+
return false;
};
parentNode->fuseWith(childNode);
- if (childNode->getType() == Quantize) {
+ if (childNode->getType() == Quantize || childNode->getType() == Eltwise) {
auto parentEdges = childNode->parentEdges;
for (auto &parentEdge : parentEdges) {
auto p_edge = parentEdge.lock();
}
void MKLDNNGraphOptimizer::FuseEltwiseAndSimple(MKLDNNGraph &graph) {
- auto isOneOf = [&](mkldnn::algorithm alg, std::vector<mkldnn::algorithm> algs) {
- for (auto a : algs) {
- if (alg == a) {
- return true;
- }
- }
- return false;
- };
-
- auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) {
- auto& edges = graph.GetEdges();
- for (auto it = edges.begin(); it != edges.end(); it++) {
- if ((*it) == edge) {
- edges.erase(it);
- return;
- }
- }
- };
-
auto& graphNodes = graph.GetNodes();
auto isSutableParentNode = [](MKLDNNNodePtr node) {
- bool isSutableEltwise = node->getType() == Eltwise;
+ return node->getType() == Eltwise && node->getChildEdges().size() == 1;
+ };
- if (isSutableEltwise) {
- auto *eltwiseLayer = dynamic_cast<EltwiseLayer *>(node->getCnnLayer().get());
- if (eltwiseLayer == nullptr)
- THROW_IE_EXCEPTION << "Cannot get Eltwise layer " << node->getName();
+ auto isSutableChildNode = [&](MKLDNNNodePtr parentNode, MKLDNNNodePtr childNode) {
+ for (auto &childParentEdge : childNode->getParentEdges()) {
+ // WA to prevent unsupported reorder exception issue in some cases
+ if (childParentEdge.lock()->getParent()->getType() == Split) {
+ return false;
+ }
- ptrdiff_t maxChannels = 1;
- for (size_t i = 0; i < node->getParentEdges().size(); i++) {
- if (node->getParentEdgeAt(0)->getDims().ndims() != node->getParentEdgeAt(i)->getDims().ndims())
- return false;
- if (node->getParentEdgeAt(i)->getDims().ndims() != 2 &&
- node->getParentEdgeAt(i)->getDims().ndims() != 4 &&
- node->getParentEdgeAt(i)->getDims().ndims() != 5)
+ // Avoid cycle dependencies
+ for (auto &parentParentEdge : parentNode->getParentEdges()) {
+ if (childParentEdge.lock()->getParent() == parentParentEdge.lock()->getParent())
return false;
- if (maxChannels < node->getParentEdgeAt(i)->getDims()[1])
- maxChannels = node->getParentEdgeAt(i)->getDims()[1];
}
-
- int simdWidth = mkldnn::impl::cpu::mayiuse(impl::cpu::cpu_isa_t::avx512_common) ? 16 :
- mkldnn::impl::cpu::mayiuse(impl::cpu::cpu_isa_t::avx2) ? 8 : 4;
- if (maxChannels < simdWidth)
- return false;
-
- return node->getChildEdges().size() == 1 &&
- (eltwiseLayer->_operation == EltwiseLayer::Sum || eltwiseLayer->_operation == EltwiseLayer::Prod) &&
- !node->isFusedWith(Quantize);
- } else {
- return false;
}
- };
- auto isSutableChildNode = [&](MKLDNNNodePtr node) {
- if (!node->getCnnLayer())
+ if (!childNode->getFusedWith().empty())
return false;
- if (node->getType() == Quantize) {
- auto* quantizeNode = dynamic_cast<MKLDNNQuantizeNode*>(node.get());
- if (quantizeNode == nullptr)
- THROW_IE_EXCEPTION << "Cannot get quantize layer " << node->getName();
- return !quantizeNode->isBinarization();
- } else if (node->getType() == Activation) {
- // Applicability was narrowed down in order not to affect FP32 topologies
- if (node->getChildEdges().size() != 1)
- return false;
- if (node->getChildEdgeAt(0)->getChild()->getType() != Quantize)
- return false;
-
- auto *activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get());
- if (activationNode == nullptr)
- THROW_IE_EXCEPTION << "Cannot get activation layer " << node->getName();
- return isOneOf(activationNode->getAlgorithm(), {eltwise_relu, eltwise_elu, eltwise_logistic, eltwise_bounded_relu,
- eltwise_clamp, eltwise_swish, eltwise_hswish, eltwise_mish,
- eltwise_hsigmoid});
- }
-
- return false;
+ auto eltwiseNode = dynamic_cast<MKLDNNEltwiseNode*>(parentNode.get());
+ return eltwiseNode->canFuse(childNode);
};
auto parent = graphNodes.begin();
}
auto childNode = parentNode->getChildEdgeAt(0)->getChild();
- if (!isSutableChildNode(childNode)) {
+ if (!isSutableChildNode(parentNode, childNode)) {
parent++;
continue;
}
removeEdge(graph, p_edge);
}
- }
- graph.DropNode(childNode);
+ graph.DropNode(childNode);
+ } else if (childNode->getType() == Eltwise) {
+ auto childs = childNode->childEdges;
+ auto parents = childNode->parentEdges;
+
+ for (size_t i = 0; i < parents.size(); i++) {
+ auto p_edge = parents[i].lock();
+ if (!p_edge) continue;
+ auto parent = p_edge->getParent();
+ if (!parent) continue;
+
+ if (parent == parentNode) {
+ for (size_t j = 0; j < childs.size(); j++) {
+ if (!childs[j].lock())
+ continue;
+ auto child = childs[j].lock()->getChild();
+ if (!child)
+ continue;
+
+ MKLDNNEdgePtr &remEdge = p_edge;
+ int inNum = 0;
+ if (remEdge) {
+ inNum = remEdge->getInputNum();
+ remEdge->drop();
+ removeEdge(graph, remEdge);
+ }
+ remEdge = childs[j].lock();
+ int outNum = 0;
+ if (remEdge) {
+ outNum = remEdge->getOutputNum();
+ remEdge->drop();
+ removeEdge(graph, remEdge);
+ }
+ MKLDNNEdgePtr newEdge(new MKLDNNEdge(parent, child, inNum, outNum));
+ auto &graphEdges = graph.GetEdges();
+ graphEdges.push_back(newEdge);
+ parent->addEdge(newEdge);
+
+ parent->outDims[inNum] = child->inDims[outNum];
+ }
+ } else {
+ MKLDNNEdgePtr &remEdge = p_edge;
+ int inNum = 0;
+ if (remEdge) {
+ inNum = remEdge->getInputNum();
+ remEdge->drop();
+ removeEdge(graph, remEdge);
+ }
+
+ auto parentEltwise = parentNode;
+ MKLDNNEdgePtr newEdge(new MKLDNNEdge(parent, parentEltwise, inNum, parentEltwise->getParentEdges().size()));
+ auto &graphEdges = graph.GetEdges();
+ graphEdges.push_back(newEdge);
+ parent->addEdge(newEdge);
+
+ parentEltwise->inDims.push_back(parent->outDims[0]);
+ }
+ }
+
+ graph.DropNode(childNode);
+ } else {
+ graph.DropNode(childNode);
+ }
}
}
for (MKLDNNNodePtr& node : graph.GetNodes()) {
bool toDrop = false;
- if (node->getType() == Power) {
- PowerLayer* l = dynamic_cast<PowerLayer*>(node->getCnnLayer().get());
- if (l == nullptr)
- THROW_IE_EXCEPTION << "Cannot get power layer " << node->getName();
+ if (node->getType() == Eltwise) {
+ auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode*>(node.get());
+ if (eltwiseNode->getOpType() == PowerStatic) {
+ PowerLayer *l = dynamic_cast<PowerLayer *>(node->getCnnLayer().get());
+ if (l == nullptr)
+ THROW_IE_EXCEPTION << "Cannot get power layer " << node->getName();
- if (l->power == 1.0f && l->scale == 1.0f && l->offset == 0.0f) toDrop = true;
+ if (l->power == 1.0f && l->scale == 1.0f && l->offset == 0.0f) toDrop = true;
+ }
}
- if (node->getType() == Depthwise && node->getCnnLayer()->type == "ScaleShift") {
+ if (node->getType() == Eltwise && node->getCnnLayer()->type == "ScaleShift") {
ScaleShiftLayer* l = dynamic_cast<ScaleShiftLayer*>(node->getCnnLayer().get());
if (l == nullptr)
THROW_IE_EXCEPTION << "Cannot get scale shift layer " << node->getName();
void MKLDNNGraphOptimizer::RemoveIOScaleShifts(MKLDNNGraph &graph) {
for (MKLDNNNodePtr& node : graph.GetNodes()) {
- if (node->getType() == Depthwise && node->getCnnLayer()->type == "ScaleShift") {
+ if (node->getType() == Eltwise && node->getCnnLayer()->type == "ScaleShift") {
ScaleShiftLayer* l = dynamic_cast<ScaleShiftLayer*>(node->getCnnLayer().get());
if (l == nullptr)
THROW_IE_EXCEPTION << "Cannot get scale shift layer " << node->getName();
return false;
}
+/**
+ * Tells whether an eltwise operation type is present in a list of candidates.
+ * @param alg  operation type to look for
+ * @param algs candidate operation types
+ * @return true if alg compares equal to at least one element of algs, false otherwise
+ */
+bool MKLDNNGraphOptimizer::IsOneOf(EltwiseOpType alg, std::vector<EltwiseOpType> algs) {
+    for (size_t pos = 0; pos < algs.size(); ++pos) {
+        if (alg == algs[pos])
+            return true;
+    }
+    return false;
+}
+
+/**
+ * Removes the first occurrence of an edge from the graph's edge list.
+ * Does nothing when the edge is not in the list.
+ * @param graph graph whose edge container (GetEdges()) is modified
+ * @param edge  edge pointer to search for (compared with operator==)
+ */
+void MKLDNNGraphOptimizer::removeEdge(MKLDNNGraph &graph, MKLDNNEdgePtr& edge) {
+    auto& graphEdges = graph.GetEdges();
+    auto pos = graphEdges.begin();
+    // advance to the first matching element (or to end() when there is none)
+    while (pos != graphEdges.end() && !((*pos) == edge))
+        pos++;
+    if (pos != graphEdges.end())
+        graphEdges.erase(pos);
+}
+
void MKLDNNGraphOptimizer::FuseBroadcastAndEltwise(MKLDNNGraph &graph) {
std::vector<MKLDNNNodePtr>& graphNodes = graph.GetNodes();
auto& graphNodes = graph.GetNodes();
auto isSutableClampNode = [](MKLDNNNodePtr node) {
- if (node->getType() != Activation)
+ if (node->getType() != Eltwise)
return false;
- auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get());
- if (activationNode == nullptr)
- THROW_IE_EXCEPTION << "Cannot cast " << node->getName() << " to Activation node";
+ auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
+ if (eltwiseNode == nullptr)
+ THROW_IE_EXCEPTION << "Cannot cast " << node->getName() << " to Eltwise node";
- if (activationNode->getChildEdges().size() != 1)
+ if (eltwiseNode->getChildEdges().size() != 1)
return false;
- if (activationNode->getAlgorithm() != eltwise_clamp)
+ if (eltwiseNode->getOpType() != Clamp)
return false;
return true;
};
auto fuseClampAndQuantizeNodes = [](MKLDNNNodePtr parent, MKLDNNNodePtr child) {
- auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(parent.get());
- if (activationNode == nullptr)
- THROW_IE_EXCEPTION << "Cannot cast " << parent->getName() << " to Activation node";
+ auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(parent.get());
+ if (eltwiseNode == nullptr)
+ THROW_IE_EXCEPTION << "Cannot cast " << parent->getName() << " to Eltwise node";
auto* quantizeNode = dynamic_cast<MKLDNNQuantizeNode*>(child.get());
if (quantizeNode == nullptr)
std::vector<float> newCropLow(cropLowData.size());
std::vector<float> newCropHigh(cropHighData.size());
for (int i = 0; i < cropLowData.size(); i++)
- newCropLow[i] = std::max(cropLowData[i], activationNode->getBeta());
+ newCropLow[i] = std::max(cropLowData[i], eltwiseNode->getBeta());
for (int i = 0; i < cropHighData.size(); i++)
- newCropHigh[i] = std::min(cropHighData[i], activationNode->getAlpha());
+ newCropHigh[i] = std::min(cropHighData[i], eltwiseNode->getAlpha());
quantizeNode->setCropLow(newCropLow);
quantizeNode->setCropHigh(newCropHigh);
auto& graphNodes = graph.GetNodes();
auto isSutableScaleShiftNode = [](MKLDNNNodePtr node) {
- if (node->getType() != Depthwise)
+ if (node->getType() != Eltwise)
return false;
- auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode *>(node.get());
- if (depthwiseNode == nullptr)
- THROW_IE_EXCEPTION << "Cannot cast " << node->getName() << " to Depthwise node";
+ auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
+ if (eltwiseNode == nullptr)
+ THROW_IE_EXCEPTION << "Cannot cast " << node->getName() << " to eltwise node";
- if (depthwiseNode->getChildEdges().size() != 1)
+ if (eltwiseNode->getChildEdges().size() != 1)
return false;
- if (depthwiseNode->getAlgorithm() != depthwise_scale_shift || depthwiseNode->isBroadcast())
+ if (eltwiseNode->getOpType() != MulAdd)
return false;
return true;
};
auto fuseScaleShiftAndQuantizeNodes = [](MKLDNNNodePtr parent, MKLDNNNodePtr child) {
- auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode *>(parent.get());
- if (depthwiseNode == nullptr)
- THROW_IE_EXCEPTION << "Cannot cast " << parent->getName() << " to Depthwise node";
+ auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(parent.get());
+ if (eltwiseNode == nullptr)
+ THROW_IE_EXCEPTION << "Cannot cast " << parent->getName() << " to eltwise node";
- auto depthwiseLayer = depthwiseNode->getCnnLayer();
- if (depthwiseLayer == nullptr)
- THROW_IE_EXCEPTION << "Cannot get scale shift layer " << depthwiseNode->getName();
+ auto eltwiseLayer = eltwiseNode->getCnnLayer();
+ if (eltwiseLayer == nullptr)
+ THROW_IE_EXCEPTION << "Cannot get scale shift layer " << eltwiseNode->getName();
auto* quantizeNode = dynamic_cast<MKLDNNQuantizeNode*>(child.get());
if (quantizeNode == nullptr)
THROW_IE_EXCEPTION << "Cannot cast " << child->getName() << " to Quantize node";
- Blob::Ptr scalesBlob = depthwiseLayer->blobs["weights"];
+ Blob::Ptr scalesBlob = eltwiseLayer->blobs["weights"];
if (scalesBlob == nullptr)
return false;
- Blob::Ptr shiftsBlob = depthwiseLayer->blobs["biases"];
+ Blob::Ptr shiftsBlob = eltwiseLayer->blobs["biases"];
if (shiftsBlob == nullptr)
return false;
if (!isSutableQuantizeNode(child)) continue;
if (fuseScaleShiftAndQuantizeNodes(parent, child)) {
+ auto parentEdges = parent->parentEdges;
+ for (auto &parentEdge : parentEdges) {
+ auto p_edge = parentEdge.lock();
+ if (p_edge->getParent()->getCnnLayer()->type != "Const")
+ continue;
+
+ removeEdge(graph, p_edge);
+ }
+
graph.DropNode(parent);
}
}
#pragma once
#include "mkldnn_graph.h"
+#include "nodes/mkldnn_eltwise_node.h"
#include <vector>
namespace MKLDNNPlugin {
void ApplyImplSpecificGraphOptimizations(MKLDNNGraph& graph);
private:
- void SLTMTransform(MKLDNNGraph& graph);
void MergeConversions(MKLDNNGraph& graph);
void MergeGroupConvolution(MKLDNNGraph& graph);
void MergeTwoEqualScaleShifts(MKLDNNGraph& graph);
- void MergeSigmoidAndMultiplyToSwish(MKLDNNGraph& graph);
-#if defined(COMPILED_CPU_MKLDNN_ACTIVATION_NODE)
void FuseConvolutionAndActivation(MKLDNNGraph &graph);
void FuseFullyConnectedAndSimpleOperation(MKLDNNGraph &graph);
-#endif
-#if defined (COMPILED_CPU_MKLDNN_DEPTHWISE_NODE)
void FuseConvolutionAndDepthwise(MKLDNNGraph &graph);
-#endif
void FuseConvolutionAndSimpleOperation(MKLDNNGraph &graph);
void FuseConvolutionAndDWConvolution(MKLDNNGraph &graph);
#if defined(COMPILED_CPU_MKLDNN_QUANTIZE_NODE)
void FuseClampAndQuantize(MKLDNNGraph &graph);
bool IsOneOf(Type type, std::vector<Type> types);
+ bool IsOneOf(EltwiseOpType alg, std::vector<EltwiseOpType> algs);
+
+ void removeEdge(MKLDNNGraph &graph, MKLDNNEdgePtr& edge);
};
} // namespace MKLDNNPlugin
#include <nodes/mkldnn_input_node.h>
#include <nodes/mkldnn_lrn_node.h>
#include <nodes/mkldnn_pooling_node.h>
-#include <nodes/mkldnn_power_node.h>
-#include <nodes/mkldnn_activation_node.h>
#include <nodes/mkldnn_reorder_node.h>
#include <nodes/mkldnn_reshape_node.h>
#include <nodes/mkldnn_roi_pooling_node.h>
-#include <nodes/mkldnn_depthwise_node.h>
#include <nodes/mkldnn_softmax_node.h>
#include <nodes/mkldnn_tile_node.h>
#include <nodes/mkldnn_split_node.h>
{ "Output", Output },
{ "Reorder", Reorder },
{ "Convolution", Convolution },
- { "ReLU", Activation },
- { "GELU", Activation },
- { "ELU", Activation },
- { "Sigmoid", Activation },
- { "Logistic", Activation },
- { "TanH", Activation },
- { "ReLU6", Activation },
- { "Exp", Activation },
- { "Not", Activation },
- { "Activation", Activation },
- { "Clamp", Activation },
- { "Swish", Activation },
- { "HSwish", Activation },
- { "Mish", Activation },
- { "HSigmoid", Activation },
- { "ScaleShift", Depthwise },
- { "PReLU", Depthwise },
+ { "ReLU", Eltwise },
+ { "GELU", Eltwise },
+ { "ELU", Eltwise },
+ { "Sigmoid", Eltwise },
+ { "Logistic", Eltwise },
+ { "TanH", Eltwise },
+ { "ReLU6", Eltwise },
+ { "Exp", Eltwise },
+ { "Not", Eltwise },
+ { "Activation", Eltwise },
+ { "Clamp", Eltwise },
+ { "Swish", Eltwise },
+ { "HSwish", Eltwise },
+ { "Mish", Eltwise },
+ { "HSigmoid", Eltwise },
+ { "ScaleShift", Eltwise },
+ { "PReLU", Eltwise },
{ "Norm", Lrn },
{ "LRN", Lrn },
{ "Pooling", Pooling },
{ "Split", Split },
{ "Slice", Split },
{ "Concat", Concatenation },
- { "Power", Power },
{ "Deconvolution", Deconvolution },
{ "Eltwise", Eltwise },
+ { "Mod", Eltwise },
+ { "Power", Eltwise },
{ "Crop", Crop },
{ "Reshape", Reshape },
{ "Tile", Tile },
SoftMax,
Split,
Concatenation,
- Power,
Eltwise,
Gemm,
Crop,
return "Split";
case Concatenation:
return "Concatenation";
- case Power:
- return "Power";
case Depthwise:
return "Depthwise";
case Crop:
#include <transformations/op_conversions/softplus_decomposition.hpp>
#include <transformations/op_conversions/convert_space_to_batch.hpp>
#include <transformations/op_conversions/convert_batch_to_space.hpp>
+#include <transformations/op_conversions/convert_mod.hpp>
#include <transformations/convert_precision.hpp>
#include <transformations/init_node_info.hpp>
#include <transformations/rt_info/fused_names_attribute.hpp>
pass_config->disable<ngraph::pass::ReduceL2Decomposition>();
pass_config->disable<ngraph::pass::SoftPlusDecomposition>();
pass_config->disable<ngraph::pass::HSigmoidDecomposition>();
+ pass_config->disable<ngraph::pass::ConvertMod>();
pass_config->enable<ngraph::pass::ConvertPadToGroupConvolution>();
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "emitter.h"
+#include <vector>
+
+using namespace mkldnn::impl::cpu;
+using namespace mkldnn::impl;
+using namespace Xbyak;
+
+namespace MKLDNNPlugin {
+
+// Base case of the overload set: compares a value against a single candidate.
+template <typename T, typename P>
+constexpr bool one_of(T val, P item) { return val == item; }
+
+// Returns true when val compares equal to item or to any of item_others.
+// Candidates are tested left to right and evaluation stops at the first match.
+template <typename T, typename P, typename... Args>
+constexpr bool one_of(T val, P item, Args... item_others) {
+    return one_of(val, item) ? true : one_of(val, item_others...);
+}
+
+
+// Upper bound on vector register indices considered during allocation:
+// 32 when the host ISA is avx512_common or avx512_core, 16 for every other ISA.
+size_t jit_emitter::get_max_vecs_count() const {
+    const bool is_avx512 = one_of(host_isa_, cpu::avx512_common, cpu::avx512_core);
+    if (is_avx512)
+        return 32;
+    return 16;
+}
+
+// Size in bytes used for one vector register slot on the host ISA:
+// 64 for avx512_common/avx512_core, 32 for avx2, 16 for everything else.
+size_t jit_emitter::get_vec_length() const {
+    if (one_of(host_isa_, cpu::avx512_common, cpu::avx512_core))
+        return 64;
+    if (one_of(host_isa_, cpu::avx2))
+        return 32;
+    return 16;
+}
+
+// Emits a store of vector register vec_idx to addr. The register class follows the
+// host ISA: Xmm for sse42, Ymm for avx2 and Zmm for any other ISA value.
+void jit_emitter::push_vec(const Xbyak::Address &addr, size_t vec_idx) const {
+    switch (host_isa_) {
+        case cpu::sse42:
+            h->uni_vmovups(addr, Xmm(vec_idx));
+            break;
+        case cpu::avx2:
+            h->uni_vmovups(addr, Ymm(vec_idx));
+            break;
+        default:
+            h->uni_vmovups(addr, Zmm(vec_idx));
+            break;
+    }
+}
+
+// Emits a load of vector register vec_idx from addr. The register class follows the
+// host ISA: Xmm for sse42, Ymm for avx2 and Zmm for any other ISA value.
+void jit_emitter::pop_vec(size_t vec_idx, const Xbyak::Address &addr) const {
+    switch (host_isa_) {
+        case cpu::sse42:
+            h->uni_vmovups(Xmm(vec_idx), addr);
+            break;
+        case cpu::avx2:
+            h->uni_vmovups(Ymm(vec_idx), addr);
+            break;
+        default:
+            h->uni_vmovups(Zmm(vec_idx), addr);
+            break;
+    }
+}
+
+// Number of auxiliary vector registers the emitter needs in addition to its inputs/outputs.
+// The base implementation needs none; the function is declared virtual in the class.
+size_t jit_emitter::aux_vecs_count() const {
+ return 0;
+}
+
+// Number of auxiliary general-purpose registers the emitter needs.
+// Exactly one is required (to hold the table address) when the constant table has
+// entries; none otherwise.
+size_t jit_emitter::aux_gprs_count() const {
+    if (entry_map_.empty())
+        return 0;
+    return 1;
+}
+
+// Returns the set of execution precisions reported as supported; this implementation lists FP32 only.
+std::set<InferenceEngine::Precision> jit_emitter::get_supported_precisions() {
+ return {InferenceEngine::Precision::FP32};
+}
+
+// Selects the auxiliary vector/general-purpose registers for one emit() call and emits the
+// code that saves every register taken outside of the caller-provided pools.
+//
+// in_vec_idxs   - vector registers holding the inputs; never picked as auxiliary registers
+// pool_vec_idxs - vector registers the caller offers as scratch; used first and not saved
+// pool_gpr_idxs - general-purpose registers the caller offers as scratch; used first and not saved
+//
+// Fills aux_vec_idxs / aux_gpr_idxs / preserved_vec_idxs / preserved_gpr_idxs, which
+// emitter_postamble() consumes and clears.
+void jit_emitter::emitter_preamble(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &pool_vec_idxs,
+ const std::vector<size_t> &pool_gpr_idxs) {
+ using namespace Xbyak::util;
+
+ // Start with the scratch vector registers offered by the caller.
+ for (auto idx : pool_vec_idxs)
+ aux_vec_idxs.push_back(idx);
+
+ // For sse42 mask register has to be Xmm(0)
+ if (host_isa_ == cpu::sse42 && aux_vecs_count() > 0) {
+ size_t idx = 0;
+ // Register 0 must not be one of the inputs, otherwise it cannot be used as the mask.
+ assert(std::find(in_vec_idxs.begin(), in_vec_idxs.end(), idx) == in_vec_idxs.end());
+ // If the caller did not offer register 0, take it anyway and remember to save/restore it.
+ if (std::find(aux_vec_idxs.begin(), aux_vec_idxs.end(), idx) == aux_vec_idxs.end()) {
+ aux_vec_idxs.push_back(idx);
+ preserved_vec_idxs.push_back(idx);
+ }
+
+ // moving mask vector at the beginning of aux vectors list to simplify further processing
+ for (int i = 0; i < aux_vec_idxs.size(); i++) {
+ if (aux_vec_idxs[i] == 0) {
+ size_t tmp = aux_vec_idxs[0];
+ aux_vec_idxs[0] = aux_vec_idxs[i];
+ aux_vec_idxs[i] = tmp;
+ break;
+ }
+ }
+ }
+
+ // Top up the auxiliary list, scanning register indices upwards from 0, until
+ // aux_vecs_count() registers are available. Every register taken here is also
+ // recorded as preserved so that its contents are saved below.
+ for (size_t idx = 0; idx < get_max_vecs_count(); idx++) {
+ if (aux_vec_idxs.size() >= aux_vecs_count()) break;
+
+ if (std::find(in_vec_idxs.begin(), in_vec_idxs.end(), idx) != in_vec_idxs.end()) continue;
+ if (std::find(aux_vec_idxs.begin(), aux_vec_idxs.end(), idx) != aux_vec_idxs.end()) continue;
+ if (std::find(preserved_vec_idxs.begin(), preserved_vec_idxs.end(), idx) != preserved_vec_idxs.end()) continue;
+
+ aux_vec_idxs.push_back(idx);
+ preserved_vec_idxs.push_back(idx);
+ }
+ assert(aux_vec_idxs.size() >= aux_vecs_count());
+
+ // Same logic but to allocate gprs
+ for (auto idx : pool_gpr_idxs)
+ aux_gpr_idxs.push_back(idx);
+
+ for (size_t gpr_idx = 0; gpr_idx <= Operand::R15; ++gpr_idx) {
+ size_t _idx = Operand::R15 - gpr_idx; // we allocate from the end
+
+ if (aux_gpr_idxs.size() >= aux_gprs_count()) break;
+ // RSP is skipped: it is used below to address the save area.
+ if (_idx == Operand::RSP) continue;
+ if (std::find(aux_gpr_idxs.begin(), aux_gpr_idxs.end(), _idx) != aux_gpr_idxs.end()) continue;
+ if (std::find(preserved_gpr_idxs.begin(), preserved_gpr_idxs.end(), _idx) != preserved_gpr_idxs.end()) continue;
+
+ aux_gpr_idxs.push_back(_idx);
+ preserved_gpr_idxs.push_back(_idx);
+ }
+ // NOTE(review): unlike the vector case this requires an exact match, so a pool larger
+ // than aux_gprs_count() trips the assert - confirm this is intended.
+ assert(aux_gpr_idxs.size() == aux_gprs_count());
+
+ // The first auxiliary gpr is dedicated to the table address and removed from the
+ // list of generally available auxiliary gprs.
+ if (!entry_map_.empty()) {
+ p_table = Reg64(aux_gpr_idxs[0]);
+ aux_gpr_idxs.erase(aux_gpr_idxs.begin());
+ }
+
+ // Save the gprs that were taken outside of the pool.
+ for (size_t i = 0; i < preserved_gpr_idxs.size(); ++i)
+ h->push(Reg64(preserved_gpr_idxs[i]));
+
+ // Reserve one vector-sized stack slot per preserved vector register and store them there.
+ if (preserved_vec_idxs.size())
+ h->sub(h->rsp, preserved_vec_idxs.size() * get_vec_length());
+
+ for (size_t i = 0; i < preserved_vec_idxs.size(); ++i) {
+ push_vec(h->ptr[h->rsp + i * get_vec_length()], preserved_vec_idxs[i]);
+ }
+
+ // Load the table address only after the register that receives it has been saved.
+ if (!entry_map_.empty())
+ load_table_addr();
+}
+
+
+// Emits the code that restores every register saved by emitter_preamble() and resets the
+// per-call register bookkeeping so the emitter can be used for another emit() call.
+void jit_emitter::emitter_postamble() {
+    using namespace Xbyak::util;
+
+    // Reload the preserved vector registers from their stack slots, then release the slots.
+    for (size_t i = 0; i < preserved_vec_idxs.size(); ++i)
+        pop_vec(preserved_vec_idxs[i], h->ptr[h->rsp + i * get_vec_length()]);
+
+    if (preserved_vec_idxs.size())
+        h->add(h->rsp, preserved_vec_idxs.size() * get_vec_length());
+
+    // Pop exactly the gprs the preamble pushed, in reverse order. The preamble pushes
+    // preserved_gpr_idxs.size() registers, which is smaller than aux_gprs_count() whenever
+    // the caller supplied pool gprs; iterating over aux_gprs_count() would then index past
+    // the end of preserved_gpr_idxs and unbalance the stack.
+    for (size_t i = preserved_gpr_idxs.size(); i > 0; --i)
+        h->pop(Reg64(preserved_gpr_idxs[i - 1]));
+
+    preserved_vec_idxs.clear();
+    preserved_gpr_idxs.clear();
+
+    aux_vec_idxs.clear();
+    aux_gpr_idxs.clear();
+}
+
+// Emits the constant table into the generated code: a 64-byte aligned region labelled
+// l_table that holds every registered entry in entry_map_ iteration order. Broadcast
+// entries are replicated to fill a full vector, other entries occupy a single value.
+void jit_emitter::emit_table() {
+    h->align(64);
+    h->L(l_table);
+
+    // Values are written with dd, which relies on each entry being 4 bytes wide.
+    assert(sizeof(table_entry_val_t) == 4);
+
+    for (const auto &entry : entry_map_) {
+        const auto &te = entry.second;
+        size_t bytes = sizeof(table_entry_val_t);
+        if (te.bcast)
+            bytes = get_vec_length();
+        for (size_t written = 0; written < bytes; written += sizeof(table_entry_val_t))
+            h->dd(te.val);
+    }
+}
+
+// Registers the table entries and assigns each of them its byte offset from the start
+// of the table.
+void jit_emitter::prepare_table() {
+    register_table_entries();
+
+    // All entries are registered at this point and none may be added afterwards: the
+    // offsets computed here follow entry_map_ iteration order, which is the same order
+    // emit_table() uses when it writes the values out.
+    size_t next_off = 0;
+    for (auto &entry : entry_map_) {
+        auto &te = entry.second;
+        te.off = next_off;
+        if (te.bcast)
+            next_off += get_vec_length();
+        else
+            next_off += sizeof(table_entry_val_t);
+    }
+}
+
+// Emits the code of this emitter for one set of registers.
+//
+// in_vec_idxs   - indices of the vector registers holding the inputs
+// out_vec_idxs  - indices of the vector registers receiving the outputs
+// pool_vec_idxs - vector registers offered by the caller as scratch
+// pool_gpr_idxs - general-purpose registers offered by the caller as scratch
+//
+// The operation itself (emit_impl) is wrapped between the preamble, which allocates and
+// saves auxiliary registers, and the postamble, which restores them.
+void jit_emitter::emit(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+ const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+ emitter_preamble(in_vec_idxs, pool_vec_idxs, pool_gpr_idxs);
+
+ emit_impl(in_vec_idxs, out_vec_idxs, pool_vec_idxs, pool_gpr_idxs);
+
+ emitter_postamble();
+}
+
+} // namespace MKLDNNPlugin
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ie_common.h>
+#include "jit_generator.hpp"
+#include "mkldnn_node.h"
+#include <set>
+
+namespace MKLDNNPlugin {
+
+/**
+ * Base class for JIT code emitters: small building blocks that append the instructions
+ * of one operation to a host jit_generator.
+ *
+ * emit() wraps the operation-specific emit_impl() between emitter_preamble() and
+ * emitter_postamble(), which allocate auxiliary registers and save/restore the ones taken
+ * outside of the caller-provided pools. Derived emitters may also register constants in a
+ * table (register_table_entries / prepare_table) that emit_table() writes into the
+ * generated code and that is addressed through p_table.
+ */
+class jit_emitter {
+public:
+    /**
+     * @param host     generator that receives the emitted instructions
+     * @param host_isa instruction set the code is generated for
+     * @param node     graph node the emitter is created for
+     * @param exec_prc execution precision (FP32 by default)
+     */
+    jit_emitter(mkldnn::impl::cpu::jit_generator* host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+                InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32)
+        : n(node), h(host), host_isa_(host_isa), exec_prc_(exec_prc) {  // same order as the member declarations
+        k_mask = Xbyak::Opmask(1); // FIXME: in general case we need preserve k_mask state as well
+    }
+
+    // The class is used polymorphically (pure virtual get_inputs_num), so destruction
+    // through a base pointer must reach the derived destructor.
+    virtual ~jit_emitter() = default;
+
+    // Emits the operation for the given input/output vector registers; the pools list
+    // registers the caller offers as scratch.
+    virtual void emit(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                      const std::vector<size_t> &pool_vec_idxs = {}, const std::vector<size_t> &pool_gpr_idxs = {});
+    // Writes the constant table into the generated code.
+    virtual void emit_table();
+    // Number of input vectors the operation consumes.
+    virtual size_t get_inputs_num() = 0;
+    // Number of auxiliary vector registers the operation needs.
+    virtual size_t aux_vecs_count() const;
+    static std::set<InferenceEngine::Precision> get_supported_precisions();
+
+protected:
+    // Number of auxiliary general-purpose registers the operation needs.
+    virtual size_t aux_gprs_count() const;
+
+    size_t get_max_vecs_count() const;
+    size_t get_vec_length() const;
+
+    const MKLDNNNode& n;
+    mkldnn::impl::cpu::jit_generator* h;
+    mkldnn::impl::cpu::cpu_isa_t host_isa_;
+    InferenceEngine::Precision exec_prc_;
+
+    Xbyak::Opmask k_mask;
+
+    virtual void prepare_table();
+    virtual void register_table_entries() {}
+
+    void load_table_addr() { h->mov(p_table, l_table); }
+
+    // we accept only 32bit hexadecimal table values to avoid any rounding
+    using table_entry_val_t = uint32_t;
+    using table_entry_offset_t = size_t; // offsets are in bytes wrt p_table
+    using table_entry_bcast_t = bool; // true => bcast value
+
+    struct table_entry_t {
+        table_entry_val_t val;
+        table_entry_bcast_t bcast;
+    };
+    struct mapped_table_entry_t {
+        table_entry_offset_t off;
+        table_entry_val_t val;
+        table_entry_bcast_t bcast;
+    };
+
+    Xbyak::Reg64 p_table;
+    Xbyak::Label l_table;
+
+    // Comparison predicates re-exported from jit_generator.
+    enum {
+        _cmp_eq_oq = mkldnn::impl::cpu::jit_generator::_cmp_eq_oq,
+        _cmp_neq_uq = mkldnn::impl::cpu::jit_generator::_cmp_neq_uq,
+        _cmp_lt_os = mkldnn::impl::cpu::jit_generator::_cmp_lt_os,
+        _cmp_le_os = mkldnn::impl::cpu::jit_generator::_cmp_le_os,
+        _cmp_ge_os = mkldnn::impl::cpu::jit_generator::_cmp_nlt_us,
+        _cmp_gt_os = mkldnn::impl::cpu::jit_generator::_cmp_nle_us,
+    };
+
+    // Operation-specific code generation; the base implementation emits nothing.
+    virtual void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                           const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {}
+
+    virtual void emitter_preamble(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &pool_vec_idxs,
+                                  const std::vector<size_t> &pool_gpr_idxs);
+    virtual void emitter_postamble();
+
+    // Auxiliary registers available to emit_impl(); filled by the preamble, cleared by the postamble.
+    std::vector<size_t> aux_vec_idxs;
+    std::vector<size_t> aux_gpr_idxs;
+
+    static constexpr int k_mask_size = 8;
+
+    // Address of the table entry registered under key; key_off_val_shift selects the
+    // n-th consecutive entry after it.
+    Xbyak::Address table_val(std::string key, size_t key_off_val_shift = 0) const {
+        auto off = table_off(key, key_off_val_shift);
+        return h->ptr[p_table + off];
+    }
+
+    using table_t = std::multimap<std::string, table_entry_t>;
+    using mapped_table_t = std::multimap<std::string, mapped_table_entry_t>;
+
+    mapped_table_t entry_map_;
+
+    // Registers one table entry; its offset is assigned later by prepare_table().
+    void push_arg_entry_of(const std::string key, const table_entry_val_t val, const bool broadcast) {
+        mapped_table_entry_t te {0, val, broadcast};
+        entry_map_.insert(std::make_pair(key, te));
+    }
+
+    // Registers every entry of t.
+    void push_entries_of(const table_t &t) {
+        for (auto it = t.begin(); it != t.end(); it++) {
+            auto key = (*it).first;
+            auto te = (*it).second; // copy values from table
+            push_arg_entry_of(key, te.val, te.bcast);
+        }
+    }
+
+private:
+    // Registers that emitter_preamble() saved and emitter_postamble() has to restore.
+    std::vector<size_t> preserved_vec_idxs;
+    std::vector<size_t> preserved_gpr_idxs;
+
+    void push_vec(const Xbyak::Address &addr, size_t vec_idx) const;
+    void pop_vec(size_t vec_idx, const Xbyak::Address &addr) const;
+
+    // Byte offset of the entry registered under key; the key is only read, hence const&.
+    size_t table_off(const std::string& key, size_t key_off_val_shift = 0) const {
+        // assumption: all table entries sharing the same key also
+        // share their broadcast property
+        // TODO: enforce through data structure
+        const auto it = entry_map_.find(key); // search an entry for a key
+        assert(it != entry_map_.end());
+        const auto &te = (*it).second;
+        const auto scale = te.bcast ? get_vec_length() : sizeof(table_entry_val_t);
+        return te.off + key_off_val_shift * scale;
+    }
+};
+
+} // namespace MKLDNNPlugin
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "common/emitter.h"
+#include "jit_eltwise_emitters.hpp"
+#include "mkldnn_eltwise_node.h"
+#include "jit_uni_eltwise.hpp"
+#include "legacy/ie_layers.h"
+
+using namespace InferenceEngine;
+using namespace mkldnn::impl::utils;
+using namespace mkldnn::impl::cpu;
+using namespace Xbyak;
+
+namespace MKLDNNPlugin {
+
+/// ADD ///
+// Constructs the add emitter by forwarding all arguments to the jit_emitter base constructor.
+jit_add_emitter::jit_add_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+: jit_emitter(host, host_isa, node, exec_prc) {}
+
+// Add consumes two input vectors (emit_isa reads in_vec_idxs[0] and in_vec_idxs[1]).
+size_t jit_add_emitter::get_inputs_num() { return 2; }
+
+// Dispatches to the emit_isa<> instantiation matching the host ISA. Only sse42, avx2 and
+// avx512_common are handled; any other ISA value trips the assert. The register pools are
+// not used by this emitter.
+void jit_add_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    switch (host_isa_) {
+        case cpu::sse42:
+            emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+            break;
+        case cpu::avx2:
+            emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+            break;
+        case cpu::avx512_common:
+            emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+            break;
+        default:
+            assert(!"unsupported isa");
+            break;
+    }
+}
+
+// Emits dst = src0 + src1 for one vector register of the given ISA.
+//
+// in_vec_idxs  - [0] and [1] are the indices of the two source registers
+// out_vec_idxs - [0] is the index of the destination register
+//
+// The destination may alias either source.
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_add_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
+    Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
+    Vmm vmm_dst = Vmm(out_vec_idxs[0]);
+
+    if (isa == cpu::sse42) {
+        // SSE arithmetic is two-operand (dst op= src), so dst has to hold one addend first.
+        if (vmm_dst.getIdx() == vmm_src1.getIdx()) {
+            // dst already holds src1: copying src0 into it first would destroy src1 and
+            // yield src0 + src0. Addition is commutative, so add src0 in place instead.
+            h->uni_vaddps(vmm_dst, vmm_dst, vmm_src0);
+        } else {
+            h->uni_vmovups(vmm_dst, vmm_src0);
+            h->uni_vaddps(vmm_dst, vmm_dst, vmm_src1);
+        }
+    } else {
+        // Three-operand VEX/EVEX form: no aliasing hazard.
+        h->uni_vaddps(vmm_dst, vmm_src0, vmm_src1);
+    }
+}
+
+/// MUL_ADD ///
+// Constructs the multiply-add emitter by forwarding all arguments to the jit_emitter base constructor.
+jit_mul_add_emitter::jit_mul_add_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+: jit_emitter(host, host_isa, node, exec_prc) {}
+
+// Multiply-add consumes three input vectors (emit_isa reads in_vec_idxs[0..2]).
+size_t jit_mul_add_emitter::get_inputs_num() { return 3; }
+
+// Dispatches to the emit_isa<> instantiation matching the host ISA. Only sse42, avx2 and
+// avx512_common are handled; any other ISA value trips the assert. The register pools are
+// not read here.
+void jit_mul_add_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                    const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    switch (host_isa_) {
+        case cpu::sse42:
+            emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+            break;
+        case cpu::avx2:
+            emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+            break;
+        case cpu::avx512_common:
+            emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+            break;
+        default:
+            assert(!"unsupported isa");
+            break;
+    }
+}
+
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_mul_add_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+ using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+ Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
+ Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
+ Vmm vmm_src2 = Vmm(in_vec_idxs[2]);
+ Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
+ Vmm vmm_dst = Vmm(out_vec_idxs[0]);
+
+ if (isa == cpu::sse42) {
+ h->uni_vmovups(vmm_dst, vmm_src0);
+ h->mulps(vmm_dst, vmm_src1);
+ h->addps(vmm_dst, vmm_src2);
+ } else {
+ Vmm vmm_mul0;
+ if (vmm_dst.getIdx() == vmm_src0.getIdx()) {
+ h->uni_vmovups(vmm_aux0, vmm_src0);
+ vmm_mul0 = vmm_aux0;
+ } else {
+ vmm_mul0 = vmm_src0;
+ }
+
+ Vmm vmm_mul1;
+ if (vmm_dst.getIdx() == vmm_src1.getIdx()) {
+ h->uni_vmovups(vmm_aux0, vmm_src1);
+ vmm_mul1 = vmm_aux0;
+ } else {
+ vmm_mul1 = vmm_src1;
+ }
+
+ if (vmm_dst.getIdx() != vmm_src2.getIdx())
+ h->uni_vmovups(vmm_dst, vmm_src2);
+ h->uni_vfmadd231ps(vmm_dst, vmm_mul0, vmm_mul1);
+ }
+}
+
+size_t jit_mul_add_emitter::aux_vecs_count() const {
+ return 1;
+}
+
+/// SUB ///
+// Forwards the host generator, target ISA, graph node and execution precision to the base emitter.
+jit_subtract_emitter::jit_subtract_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+}
+
+// Subtraction takes two vector operands (minuend, subtrahend).
+size_t jit_subtract_emitter::get_inputs_num() {
+    return 2;
+}
+
+// Picks the emit_isa<> instantiation that matches host_isa_.
+// pool_vec_idxs / pool_gpr_idxs are accepted but not used by this emitter.
+void jit_subtract_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                     const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    switch (host_isa_) {
+    case cpu::sse42:
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx2:
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx512_common:
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+        break;
+    default:
+        assert(!"unsupported isa");
+    }
+}
+
+// Emits dst = src0 - src1 on packed single-precision lanes.
+//   in_vec_idxs[0], in_vec_idxs[1] - vector register indices of minuend and subtrahend
+//   out_vec_idxs[0]                - vector register index of the result
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_subtract_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm minuend = Vmm(in_vec_idxs[0]);
+    Vmm subtrahend = Vmm(in_vec_idxs[1]);
+    Vmm result = Vmm(out_vec_idxs[0]);
+
+    const bool legacy_sse = (isa == cpu::sse42);
+    if (!legacy_sse) {
+        h->uni_vsubps(result, minuend, subtrahend);
+    } else {
+        // The SSE form accumulates into its first operand, so seed the result with src0 first.
+        // NOTE(review): if the result register is also src1 (and not src0), the copy below
+        // overwrites src1 before it is read - confirm callers never pass that aliasing.
+        h->uni_vmovups(result, minuend);
+        h->uni_vsubps(result, result, subtrahend);
+    }
+}
+
+
+/// MULTIPLY ///
+// Forwards the host generator, target ISA, graph node and execution precision to the base emitter.
+jit_multiply_emitter::jit_multiply_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+}
+
+// Multiplication takes two vector operands.
+size_t jit_multiply_emitter::get_inputs_num() {
+    return 2;
+}
+
+// Picks the emit_isa<> instantiation that matches host_isa_.
+// pool_vec_idxs / pool_gpr_idxs are accepted but not used by this emitter.
+void jit_multiply_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                     const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    switch (host_isa_) {
+    case cpu::sse42:
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx2:
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx512_common:
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+        break;
+    default:
+        assert(!"unsupported isa");
+    }
+}
+
+// Emits dst = src0 * src1 on packed single-precision lanes.
+//   in_vec_idxs[0], in_vec_idxs[1] - vector register indices of the two factors
+//   out_vec_idxs[0]                - vector register index of the result
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_multiply_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm lhs = Vmm(in_vec_idxs[0]);
+    Vmm rhs = Vmm(in_vec_idxs[1]);
+    Vmm result = Vmm(out_vec_idxs[0]);
+
+    const bool legacy_sse = (isa == cpu::sse42);
+    if (!legacy_sse) {
+        h->uni_vmulps(result, lhs, rhs);
+    } else {
+        // The SSE form accumulates into its first operand, so seed the result with src0 first.
+        // NOTE(review): if the result register is also src1 (and not src0), the copy below
+        // overwrites src1 before it is read - confirm callers never pass that aliasing.
+        h->uni_vmovups(result, lhs);
+        h->uni_vmulps(result, result, rhs);
+    }
+}
+
+
+/// DIVIDE ///
+// Forwards the host generator, target ISA, graph node and execution precision to the base emitter.
+jit_divide_emitter::jit_divide_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+}
+
+// Division takes two vector operands (dividend, divisor).
+size_t jit_divide_emitter::get_inputs_num() {
+    return 2;
+}
+
+// Picks the emit_isa<> instantiation that matches host_isa_.
+// pool_vec_idxs / pool_gpr_idxs are accepted but not used by this emitter.
+void jit_divide_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                   const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    switch (host_isa_) {
+    case cpu::sse42:
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx2:
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx512_common:
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+        break;
+    default:
+        assert(!"unsupported isa");
+    }
+}
+
+// Emits dst = src0 / src1 on packed single-precision lanes.
+//   in_vec_idxs[0], in_vec_idxs[1] - vector register indices of dividend and divisor
+//   out_vec_idxs[0]                - vector register index of the result
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_divide_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm dividend = Vmm(in_vec_idxs[0]);
+    Vmm divisor = Vmm(in_vec_idxs[1]);
+    Vmm result = Vmm(out_vec_idxs[0]);
+
+    const bool legacy_sse = (isa == cpu::sse42);
+    if (!legacy_sse) {
+        h->uni_vdivps(result, dividend, divisor);
+    } else {
+        // The SSE form accumulates into its first operand, so seed the result with src0 first.
+        // NOTE(review): if the result register is also src1 (and not src0), the copy below
+        // overwrites src1 before it is read - confirm callers never pass that aliasing.
+        h->uni_vmovups(result, dividend);
+        h->uni_vdivps(result, result, divisor);
+    }
+}
+
+
+/// FLOOR_MOD ///
+// Forwards the host generator, target ISA, graph node and execution precision to the base emitter.
+jit_floor_mod_emitter::jit_floor_mod_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+}
+
+// Two vector operands: dividend and divisor.
+size_t jit_floor_mod_emitter::get_inputs_num() {
+    return 2;
+}
+
+// Picks the emit_isa<> instantiation that matches host_isa_.
+// pool_vec_idxs / pool_gpr_idxs are accepted but not used by this emitter.
+void jit_floor_mod_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                      const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    switch (host_isa_) {
+    case cpu::sse42:
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx2:
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx512_common:
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+        break;
+    default:
+        assert(!"unsupported isa");
+    }
+}
+
+// Emits dst = src0 - round_down(src0 / src1) * src1 on packed single-precision lanes.
+//   in_vec_idxs[0], in_vec_idxs[1] - vector register indices of dividend and divisor
+//   out_vec_idxs[0]                - vector register index of the result
+// aux_vec_idxs[0] is used as scratch for the rounded quotient.
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_floor_mod_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm dividend = Vmm(in_vec_idxs[0]);
+    Vmm divisor = Vmm(in_vec_idxs[1]);
+    Vmm result = Vmm(out_vec_idxs[0]);
+    Vmm quotient = Vmm(aux_vec_idxs[0]);
+
+    // The result starts out as the dividend; the copy is skipped when they share a register.
+    if (result.getIdx() != dividend.getIdx())
+        h->uni_vmovups(result, dividend);
+
+    // quotient = dividend / divisor
+    if (isa == cpu::sse42) {
+        // The SSE form divides in place, so the dividend is copied to the scratch register first.
+        h->uni_vmovups(quotient, dividend);
+        h->uni_vdivps(quotient, quotient, divisor);
+    } else {
+        h->uni_vdivps(quotient, dividend, divisor);
+    }
+
+    h->uni_vroundps(quotient, quotient, 1); // rounding down
+    h->uni_vmulps(quotient, quotient, divisor);
+    h->uni_vsubps(result, result, quotient);
+}
+
+// One scratch vector register is needed for the quotient.
+size_t jit_floor_mod_emitter::aux_vecs_count() const {
+    return 1;
+}
+
+/// MOD ///
+// Forwards the host generator, target ISA, graph node and execution precision to the base emitter.
+jit_mod_emitter::jit_mod_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+}
+
+// Two vector operands: dividend and divisor.
+size_t jit_mod_emitter::get_inputs_num() {
+    return 2;
+}
+
+// Picks the emit_isa<> instantiation that matches host_isa_.
+// pool_vec_idxs / pool_gpr_idxs are accepted but not used by this emitter.
+void jit_mod_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    switch (host_isa_) {
+    case cpu::sse42:
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx2:
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx512_common:
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+        break;
+    default:
+        assert(!"unsupported isa");
+    }
+}
+
+// Emits dst = src0 - truncate(src0 / src1) * src1 on packed single-precision lanes.
+//   in_vec_idxs[0], in_vec_idxs[1] - vector register indices of dividend and divisor
+//   out_vec_idxs[0]                - vector register index of the result
+// aux_vec_idxs[0] is used as scratch for the truncated quotient.
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_mod_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm dividend = Vmm(in_vec_idxs[0]);
+    Vmm divisor = Vmm(in_vec_idxs[1]);
+    Vmm result = Vmm(out_vec_idxs[0]);
+    Vmm quotient = Vmm(aux_vec_idxs[0]);
+
+    // The result starts out as the dividend; the copy is skipped when they share a register.
+    if (result.getIdx() != dividend.getIdx())
+        h->uni_vmovups(result, dividend);
+
+    // quotient = dividend / divisor
+    if (isa == cpu::sse42) {
+        // The SSE form divides in place, so the dividend is copied to the scratch register first.
+        h->uni_vmovups(quotient, dividend);
+        h->uni_vdivps(quotient, quotient, divisor);
+    } else {
+        h->uni_vdivps(quotient, dividend, divisor);
+    }
+
+    h->uni_vroundps(quotient, quotient, 3); // truncate
+    h->uni_vmulps(quotient, quotient, divisor);
+    h->uni_vsubps(result, result, quotient);
+}
+
+// One scratch vector register is needed for the quotient.
+size_t jit_mod_emitter::aux_vecs_count() const {
+    return 1;
+}
+
+/// MAXIMUM ///
+// Forwards the host generator, target ISA, graph node and execution precision to the base emitter.
+jit_maximum_emitter::jit_maximum_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+}
+
+// Maximum takes two vector operands.
+size_t jit_maximum_emitter::get_inputs_num() {
+    return 2;
+}
+
+// Picks the emit_isa<> instantiation that matches host_isa_.
+// pool_vec_idxs / pool_gpr_idxs are accepted but not used by this emitter.
+void jit_maximum_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                    const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    switch (host_isa_) {
+    case cpu::sse42:
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx2:
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx512_common:
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+        break;
+    default:
+        assert(!"unsupported isa");
+    }
+}
+
+// Emits dst = max(src0, src1) lane-wise. The instruction depends on exec_prc_:
+// the packed-float maximum for FP32, the packed signed 32-bit maximum for I32.
+//   in_vec_idxs[0], in_vec_idxs[1] - vector register indices of the operands
+//   out_vec_idxs[0]                - vector register index of the result
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_maximum_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm lhs = Vmm(in_vec_idxs[0]);
+    Vmm rhs = Vmm(in_vec_idxs[1]);
+    Vmm result = Vmm(out_vec_idxs[0]);
+
+    const bool legacy_sse = (isa == cpu::sse42);
+
+    // On SSE the result register doubles as the first operand, so it is seeded with src0
+    // (unless it already is src0).
+    if (legacy_sse && lhs.getIdx() != result.getIdx())
+        h->uni_vmovups(result, lhs);
+    Vmm first = legacy_sse ? result : lhs;
+
+    switch (exec_prc_) {
+    case Precision::FP32:
+        h->uni_vmaxps(result, first, rhs);
+        break;
+    case Precision::I32:
+        h->uni_vpmaxsd(result, first, rhs);
+        break;
+    default:
+        assert(!"unsupported precision");
+    }
+}
+
+// Execution precisions this emitter can generate code for.
+std::set<InferenceEngine::Precision> jit_maximum_emitter::get_supported_precisions() {
+    std::set<InferenceEngine::Precision> supported;
+    supported.insert(Precision::FP32);
+    supported.insert(Precision::I32);
+    return supported;
+}
+
+/// MINIMUM ///
+// Forwards the host generator, target ISA, graph node and execution precision to the base emitter.
+jit_minimum_emitter::jit_minimum_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+}
+
+// Minimum takes two vector operands.
+size_t jit_minimum_emitter::get_inputs_num() {
+    return 2;
+}
+
+// Picks the emit_isa<> instantiation that matches host_isa_.
+// pool_vec_idxs / pool_gpr_idxs are accepted but not used by this emitter.
+void jit_minimum_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                    const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    switch (host_isa_) {
+    case cpu::sse42:
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx2:
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx512_common:
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+        break;
+    default:
+        assert(!"unsupported isa");
+    }
+}
+
+// Emits dst = min(src0, src1) lane-wise. The instruction depends on exec_prc_:
+// the packed-float minimum for FP32, the packed signed 32-bit minimum for I32.
+//   in_vec_idxs[0], in_vec_idxs[1] - vector register indices of the operands
+//   out_vec_idxs[0]                - vector register index of the result
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_minimum_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm lhs = Vmm(in_vec_idxs[0]);
+    Vmm rhs = Vmm(in_vec_idxs[1]);
+    Vmm result = Vmm(out_vec_idxs[0]);
+
+    const bool legacy_sse = (isa == cpu::sse42);
+
+    // On SSE the result register doubles as the first operand, so it is seeded with src0
+    // (unless it already is src0).
+    if (legacy_sse && lhs.getIdx() != result.getIdx())
+        h->uni_vmovups(result, lhs);
+    Vmm first = legacy_sse ? result : lhs;
+
+    switch (exec_prc_) {
+    case Precision::FP32:
+        h->uni_vminps(result, first, rhs);
+        break;
+    case Precision::I32:
+        h->uni_vpminsd(result, first, rhs);
+        break;
+    default:
+        assert(!"unsupported precision");
+    }
+}
+
+// Execution precisions this emitter can generate code for.
+std::set<InferenceEngine::Precision> jit_minimum_emitter::get_supported_precisions() {
+    std::set<InferenceEngine::Precision> supported;
+    supported.insert(Precision::FP32);
+    supported.insert(Precision::I32);
+    return supported;
+}
+
+/// SQUARED_DIFFERENCE ///
+// Forwards the host generator, target ISA, graph node and execution precision to the base emitter.
+jit_squared_difference_emitter::jit_squared_difference_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+}
+
+// Two vector operands.
+size_t jit_squared_difference_emitter::get_inputs_num() {
+    return 2;
+}
+
+// Picks the emit_isa<> instantiation that matches host_isa_.
+// pool_vec_idxs / pool_gpr_idxs are accepted but not used by this emitter.
+void jit_squared_difference_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                               const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    switch (host_isa_) {
+    case cpu::sse42:
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx2:
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx512_common:
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+        break;
+    default:
+        assert(!"unsupported isa");
+    }
+}
+
+// Emits dst = (src0 - src1) * (src0 - src1) on packed single-precision lanes.
+//   in_vec_idxs[0], in_vec_idxs[1] - vector register indices of the operands
+//   out_vec_idxs[0]                - vector register index of the result
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_squared_difference_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm minuend = Vmm(in_vec_idxs[0]);
+    Vmm subtrahend = Vmm(in_vec_idxs[1]);
+    Vmm diff = Vmm(out_vec_idxs[0]);
+
+    // Step 1: diff = src0 - src1
+    if (isa == cpu::sse42) {
+        // The SSE form subtracts in place, so diff is seeded with src0 unless it already is src0.
+        if (minuend.getIdx() != diff.getIdx())
+            h->uni_vmovups(diff, minuend);
+        h->uni_vsubps(diff, diff, subtrahend);
+    } else {
+        h->uni_vsubps(diff, minuend, subtrahend);
+    }
+
+    // Step 2: square the difference in place.
+    h->uni_vmulps(diff, diff, diff);
+}
+
+
+/// POWER_DYNAMIC ///
+// Forwards the host generator, target ISA, graph node and execution precision to the base emitter.
+jit_power_dynamic_emitter::jit_power_dynamic_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+}
+
+// Two vector operands: base and exponent.
+size_t jit_power_dynamic_emitter::get_inputs_num() {
+    return 2;
+}
+
+// Picks the emit_isa<> instantiation that matches host_isa_.
+// pool_vec_idxs / pool_gpr_idxs are accepted but not used by this emitter.
+void jit_power_dynamic_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                          const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    switch (host_isa_) {
+    case cpu::sse42:
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx2:
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx512_common:
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+        break;
+    default:
+        assert(!"unsupported isa");
+    }
+}
+
+// Emits code that computes dst[i] = powf(src0[i], src1[i]) for every float lane by calling
+// the C library powf from the generated code, one lane at a time.
+//   in_vec_idxs[0]  - vector register index of the base (src0)
+//   in_vec_idxs[1]  - vector register index of the exponent (src1)
+//   out_vec_idxs[0] - vector register index of the result
+// Around the calls the code spills the listed GPRs, the opmask registers (AVX-512 only) and
+// all vector registers to the stack, and restores them afterwards in reverse order.
+// Stack layout of the vector area, in units of get_vec_length() from rsp:
+//   slot 0 = src0 (overwritten lane by lane with the results), slot 1 = src1,
+//   slots 2.. = saved Vmm(0) .. Vmm(get_max_vecs_count() - 1).
+// NOTE(review): no extra stack space is reserved immediately before the call; if this code
+// is ever generated for the Microsoft x64 calling convention, the callee expects 32 bytes
+// of shadow space there - confirm.
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_power_dynamic_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+ using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+ Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
+ Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
+ Vmm vmm_dst = Vmm(out_vec_idxs[0]);
+
+ // powf receives its two float arguments in xmm0/xmm1 and returns the result in xmm0
+ Xmm xmm0 = Xmm(0), xmm1 = Xmm(1);
+
+ // caller obligation to save gprs as callee may use them
+ size_t gpr_size = 8;
+ Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->rax,
+ h->rcx, h->rdx, h->rdi, h->rsi, h->rbp, h->rbx};
+ size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);
+
+ h->sub(h->rsp, n_gprs_to_save * gpr_size);
+ for (size_t i = 0; i < n_gprs_to_save; ++i)
+ h->mov(h->ptr[h->rsp + i * gpr_size], gprs_to_save[i]);
+
+ // caller obligation to save k-regs as callee may use them
+ size_t n_k_regs_to_save = 8;
+ if (isa == cpu::avx512_common || isa == cpu::avx512_core) {
+ h->sub(h->rsp, n_k_regs_to_save * k_mask_size);
+ for (size_t i = 0; i < n_k_regs_to_save; ++i) {
+ // full 64-bit mask move when avx512_core is available, 16-bit move otherwise
+ if (mayiuse(avx512_core))
+ h->kmovq(h->ptr[h->rsp + i * k_mask_size], Opmask(i));
+ else
+ h->kmovw(h->ptr[h->rsp + i * k_mask_size], Opmask(i));
+ }
+ }
+
+ // 1. Caller obligation to save vector registers as callee may use them.
+ // 2. Additionally save space for vmm_src, to put the answer in-place on
+ // this space and space for beta.
+ // 3. There is an implicit assumption that the host code uses the same
+ // `isa` as the injector. Once the assumption is wrong, `vecs_count` and
+ // `vlen` should be replaced with `host_isa::vlen` and
+ // `host_isa::vecs_count`.
+ h->sub(h->rsp, (get_max_vecs_count() + 2) * get_vec_length());
+ for (size_t i = 2; i < get_max_vecs_count() + 2; ++i)
+ h->uni_vmovups(h->ptr[h->rsp + i * get_vec_length()], Vmm(i - 2));
+ h->uni_vmovups(h->ptr[h->rsp + 0 * get_vec_length()], vmm_src0); // src
+ h->uni_vmovups(h->ptr[h->rsp + 1 * get_vec_length()], vmm_src1); // beta
+
+ // keep the function address in a gpr so it can be used as the call target
+ // (rbp was spilled above and is restored at the end)
+ h->mov(h->rbp, reinterpret_cast<uintptr_t>(powf));
+
+ // align stack on 16-byte as ABI requires
+ // rbx keeps the adjustment (rsp & 0xf) so the spill area stays addressable as rsp + rbx
+ // and the adjustment can be undone after the loop; this relies on the callee
+ // preserving rbx and rbp across the call
+ h->mov(h->rbx, h->rsp);
+ h->and_(h->rbx, 0xf);
+ h->sub(h->rsp, h->rbx);
+
+ // Take src, apply powf on it and replace value on a stack with dst.
+ // The loop runs at code-generation time: one load/load/call/store group is emitted per lane.
+ for (size_t i = 0; i < get_vec_length() / sizeof(float); ++i) {
+ const Address &source = h->ptr[h->rsp + h->rbx + i * sizeof(float)];
+ h->uni_vmovss(xmm0, source);
+ // lane i of the exponent, stored one vector length above the base
+ h->uni_vmovss(xmm1, h->ptr[h->rsp + h->rbx + get_vec_length() + i * sizeof(float)]);
+ h->call(h->rbp);
+ h->uni_vmovss(source, xmm0);
+ }
+
+ // undo the alignment adjustment
+ h->add(h->rsp, h->rbx);
+
+ // restore vector registers
+ // (the unsigned counter stops at i == 1, so it never wraps around)
+ for (size_t i = get_max_vecs_count() + 1; i >= 2; --i)
+ h->uni_vmovups(Vmm(i - 2), h->ptr[h->rsp + i * get_vec_length()]);
+ // load the result last so the register restore above cannot overwrite it
+ h->uni_vmovups(vmm_dst, h->ptr[h->rsp + 0 * get_vec_length()]);
+ h->add(h->rsp, (get_max_vecs_count() + 2) * get_vec_length());
+
+ // restore k registers
+ if (isa == cpu::avx512_common || isa == cpu::avx512_core) {
+ for (int i = n_k_regs_to_save - 1; i >= 0; --i) {
+ if (mayiuse(avx512_core))
+ h->kmovq(Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
+ else
+ h->kmovw(Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
+ }
+ h->add(h->rsp, n_k_regs_to_save * k_mask_size);
+ }
+
+ // restore gpr registers
+ for (int i = n_gprs_to_save - 1; i >= 0; --i)
+ h->mov(gprs_to_save[i], h->ptr[h->rsp + i * gpr_size]);
+ h->add(h->rsp, n_gprs_to_save * gpr_size);
+}
+
+
+/// EQUAL ///
+// Lane-wise equality test on packed single-precision values: lanes where src0 == src1
+// receive the "one" table constant (0x3f800000, i.e. 1.0f), all other lanes receive zero.
+jit_equal_emitter::jit_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+    // emit_isa() reads constants through table_val(); presumably prepare_table() sets up
+    // the entries declared in register_table_entries() - confirm in jit_emitter.
+    prepare_table();
+}
+
+// Two vector operands.
+size_t jit_equal_emitter::get_inputs_num() {
+    return 2;
+}
+
+// Picks the emit_isa<> instantiation that matches host_isa_.
+// pool_vec_idxs / pool_gpr_idxs are accepted but not used by this emitter.
+void jit_equal_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                  const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    switch (host_isa_) {
+    case cpu::sse42:
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx2:
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx512_common:
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+        break;
+    default:
+        assert(!"unsupported isa");
+    }
+}
+
+// Emits dst = (src0 == src1) ? one : zero.
+//   in_vec_idxs[0], in_vec_idxs[1] - vector register indices of the operands
+//   out_vec_idxs[0]                - vector register index of the result
+// aux_vec_idxs[0] holds the comparison mask and aux_vec_idxs[1] the "one" constant (SSE only).
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_equal_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm lhs = Vmm(in_vec_idxs[0]);
+    Vmm rhs = Vmm(in_vec_idxs[1]);
+    Vmm result = Vmm(out_vec_idxs[0]);
+    Vmm mask = Vmm(aux_vec_idxs[0]);
+    Vmm ones = Vmm(aux_vec_idxs[1]);
+
+    if (isa == cpu::avx2) {
+        // Vector mask, then select "one" over a zeroed result where the mask is set.
+        h->vcmpeqps(mask, lhs, rhs);
+        h->uni_vmovups(result, table_val("zero"));
+        h->vblendvps(result, result, table_val("one"), mask);
+    } else if (isa == cpu::sse42) {
+        // NOTE(review): the two-operand blendvps takes its mask implicitly from XMM0, so this
+        // sequence relies on aux_vec_idxs[0] being register 0 - confirm the base emitter
+        // guarantees that on SSE.
+        h->movups(mask, lhs);
+        h->cmpps(mask, rhs, _cmp_eq_oq);
+        h->movups(ones, table_val("one"));
+        h->pxor(result, result);
+        h->blendvps(result, ones);
+    } else {
+        // AVX-512: compare into the opmask register and blend under that mask.
+        h->vcmpps(k_mask, lhs, rhs, _cmp_eq_oq);
+        h->uni_vmovups(result, table_val("zero"));
+        h->vblendmps(result | k_mask, result, table_val("one"));
+    }
+}
+
+// Declares the constants used by emit_isa(): all-zero bits and the bit pattern of 1.0f.
+void jit_equal_emitter::register_table_entries() {
+    push_arg_entry_of("zero", 0x00000000, true);
+    push_arg_entry_of("one", 0x3f800000, true);
+}
+
+// Two scratch vector registers: comparison mask and "one" constant.
+size_t jit_equal_emitter::aux_vecs_count() const {
+    return 2;
+}
+
+/// NOT_EQUAL ///
+// Lane-wise inequality test on packed single-precision values: lanes where the equality
+// comparison of src0 and src1 holds receive zero, all other lanes receive the "one" table
+// constant (0x3f800000, i.e. 1.0f).
+jit_not_equal_emitter::jit_not_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+    // emit_isa() reads constants through table_val(); presumably prepare_table() sets up
+    // the entries declared in register_table_entries() - confirm in jit_emitter.
+    prepare_table();
+}
+
+// Two vector operands.
+size_t jit_not_equal_emitter::get_inputs_num() {
+    return 2;
+}
+
+// Picks the emit_isa<> instantiation that matches host_isa_.
+// pool_vec_idxs / pool_gpr_idxs are accepted but not used by this emitter.
+void jit_not_equal_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                      const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    switch (host_isa_) {
+    case cpu::sse42:
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx2:
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx512_common:
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+        break;
+    default:
+        assert(!"unsupported isa");
+    }
+}
+
+// Emits dst = (src0 == src1) ? zero : one. The mask is computed with the equality
+// predicate and the selection is inverted: the result starts as "one" and is replaced
+// with zero where the mask is set.
+//   in_vec_idxs[0], in_vec_idxs[1] - vector register indices of the operands
+//   out_vec_idxs[0]                - vector register index of the result
+// aux_vec_idxs[0] holds the comparison mask and aux_vec_idxs[1] a zeroed vector (SSE only).
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_not_equal_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm lhs = Vmm(in_vec_idxs[0]);
+    Vmm rhs = Vmm(in_vec_idxs[1]);
+    Vmm result = Vmm(out_vec_idxs[0]);
+    Vmm mask = Vmm(aux_vec_idxs[0]);
+    Vmm zeros = Vmm(aux_vec_idxs[1]);
+
+    if (isa == cpu::avx2) {
+        h->vcmpeqps(mask, lhs, rhs);
+        h->uni_vmovups(result, table_val("one"));
+        h->vblendvps(result, result, table_val("zero"), mask);
+    } else if (isa == cpu::sse42) {
+        // NOTE(review): the two-operand blendvps takes its mask implicitly from XMM0, so this
+        // sequence relies on aux_vec_idxs[0] being register 0 - confirm the base emitter
+        // guarantees that on SSE.
+        h->movups(mask, lhs);
+        h->cmpps(mask, rhs, _cmp_eq_oq);
+        h->movups(result, table_val("one"));
+        h->pxor(zeros, zeros);
+        h->blendvps(result, zeros);
+    } else {
+        // AVX-512: compare into the opmask register and blend under that mask.
+        h->vcmpps(k_mask, lhs, rhs, _cmp_eq_oq);
+        h->uni_vmovups(result, table_val("one"));
+        h->vblendmps(result | k_mask, result, table_val("zero"));
+    }
+}
+
+// Declares the constants used by emit_isa(): all-zero bits and the bit pattern of 1.0f.
+void jit_not_equal_emitter::register_table_entries() {
+    push_arg_entry_of("zero", 0x00000000, true);
+    push_arg_entry_of("one", 0x3f800000, true);
+}
+
+// Two scratch vector registers: comparison mask and zeroed vector.
+size_t jit_not_equal_emitter::aux_vecs_count() const {
+    return 2;
+}
+
+/// GREATER ///
+// Lane-wise "greater than" test on packed single-precision values: lanes where src0 > src1
+// receive the "one" table constant (0x3f800000, i.e. 1.0f), all other lanes receive zero.
+jit_greater_emitter::jit_greater_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+    // emit_isa() reads constants through table_val(); presumably prepare_table() sets up
+    // the entries declared in register_table_entries() - confirm in jit_emitter.
+    prepare_table();
+}
+
+// Two vector operands.
+size_t jit_greater_emitter::get_inputs_num() {
+    return 2;
+}
+
+// Picks the emit_isa<> instantiation that matches host_isa_.
+// pool_vec_idxs / pool_gpr_idxs are accepted but not used by this emitter.
+void jit_greater_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                    const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    switch (host_isa_) {
+    case cpu::sse42:
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx2:
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx512_common:
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+        break;
+    default:
+        assert(!"unsupported isa");
+    }
+}
+
+// Emits dst = (src0 > src1) ? one : zero.
+//   in_vec_idxs[0], in_vec_idxs[1] - vector register indices of the operands
+//   out_vec_idxs[0]                - vector register index of the result
+// aux_vec_idxs[0] holds the comparison mask and aux_vec_idxs[1] the "one" constant (SSE only).
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_greater_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm lhs = Vmm(in_vec_idxs[0]);
+    Vmm rhs = Vmm(in_vec_idxs[1]);
+    Vmm result = Vmm(out_vec_idxs[0]);
+    Vmm mask = Vmm(aux_vec_idxs[0]);
+    Vmm ones = Vmm(aux_vec_idxs[1]);
+
+    if (isa == cpu::avx2) {
+        h->vcmpgtps(mask, lhs, rhs);
+        h->uni_vmovups(result, table_val("zero"));
+        h->vblendvps(result, result, table_val("one"), mask);
+    } else if (isa == cpu::sse42) {
+        // NOTE(review): the two-operand blendvps takes its mask implicitly from XMM0, so this
+        // sequence relies on aux_vec_idxs[0] being register 0 - confirm the base emitter
+        // guarantees that on SSE.
+        // NOTE(review): the non-VEX cmpps immediate only defines predicates 0..7 - confirm
+        // that _cmp_gt_os is encoded within that range.
+        h->movups(mask, lhs);
+        h->cmpps(mask, rhs, _cmp_gt_os);
+        h->movups(ones, table_val("one"));
+        h->pxor(result, result);
+        h->blendvps(result, ones);
+    } else {
+        // AVX-512: compare into the opmask register and blend under that mask.
+        h->vcmpps(k_mask, lhs, rhs, _cmp_gt_os);
+        h->uni_vmovups(result, table_val("zero"));
+        h->vblendmps(result | k_mask, result, table_val("one"));
+    }
+}
+
+// Declares the constants used by emit_isa(): all-zero bits and the bit pattern of 1.0f.
+void jit_greater_emitter::register_table_entries() {
+    push_arg_entry_of("zero", 0x00000000, true);
+    push_arg_entry_of("one", 0x3f800000, true);
+}
+
+// Two scratch vector registers: comparison mask and "one" constant.
+size_t jit_greater_emitter::aux_vecs_count() const {
+    return 2;
+}
+
+/// GREATER_EQUAL ///
+// Lane-wise "greater or equal" test on packed single-precision values: lanes where
+// src0 >= src1 receive the "one" table constant (0x3f800000, i.e. 1.0f), all other lanes
+// receive zero.
+jit_greater_equal_emitter::jit_greater_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+    // emit_isa() reads constants through table_val(); presumably prepare_table() sets up
+    // the entries declared in register_table_entries() - confirm in jit_emitter.
+    prepare_table();
+}
+
+// Two vector operands.
+size_t jit_greater_equal_emitter::get_inputs_num() {
+    return 2;
+}
+
+// Picks the emit_isa<> instantiation that matches host_isa_.
+// pool_vec_idxs / pool_gpr_idxs are accepted but not used by this emitter.
+void jit_greater_equal_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                          const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    switch (host_isa_) {
+    case cpu::sse42:
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx2:
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx512_common:
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+        break;
+    default:
+        assert(!"unsupported isa");
+    }
+}
+
+// Emits dst = (src0 >= src1) ? one : zero.
+//   in_vec_idxs[0], in_vec_idxs[1] - vector register indices of the operands
+//   out_vec_idxs[0]                - vector register index of the result
+// aux_vec_idxs[0] holds the comparison mask and aux_vec_idxs[1] the "one" constant (SSE only).
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_greater_equal_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm lhs = Vmm(in_vec_idxs[0]);
+    Vmm rhs = Vmm(in_vec_idxs[1]);
+    Vmm result = Vmm(out_vec_idxs[0]);
+    Vmm mask = Vmm(aux_vec_idxs[0]);
+    Vmm ones = Vmm(aux_vec_idxs[1]);
+
+    if (isa == cpu::avx2) {
+        h->vcmpgeps(mask, lhs, rhs);
+        h->uni_vmovups(result, table_val("zero"));
+        h->vblendvps(result, result, table_val("one"), mask);
+    } else if (isa == cpu::sse42) {
+        // NOTE(review): the two-operand blendvps takes its mask implicitly from XMM0, so this
+        // sequence relies on aux_vec_idxs[0] being register 0 - confirm the base emitter
+        // guarantees that on SSE.
+        // NOTE(review): the non-VEX cmpps immediate only defines predicates 0..7 - confirm
+        // that _cmp_ge_os is encoded within that range.
+        h->movups(mask, lhs);
+        h->cmpps(mask, rhs, _cmp_ge_os);
+        h->movups(ones, table_val("one"));
+        h->pxor(result, result);
+        h->blendvps(result, ones);
+    } else {
+        // AVX-512: compare into the opmask register and blend under that mask.
+        h->vcmpps(k_mask, lhs, rhs, _cmp_ge_os);
+        h->uni_vmovups(result, table_val("zero"));
+        h->vblendmps(result | k_mask, result, table_val("one"));
+    }
+}
+
+// Declares the constants used by emit_isa(): all-zero bits and the bit pattern of 1.0f.
+void jit_greater_equal_emitter::register_table_entries() {
+    push_arg_entry_of("zero", 0x00000000, true);
+    push_arg_entry_of("one", 0x3f800000, true);
+}
+
+// Two scratch vector registers: comparison mask and "one" constant.
+size_t jit_greater_equal_emitter::aux_vecs_count() const {
+    return 2;
+}
+
+/// LESS ///
+// Lane-wise "less than" test on packed single-precision values: lanes where src0 < src1
+// receive the "one" table constant (0x3f800000, i.e. 1.0f), all other lanes receive zero.
+jit_less_emitter::jit_less_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+    // emit_isa() reads constants through table_val(); presumably prepare_table() sets up
+    // the entries declared in register_table_entries() - confirm in jit_emitter.
+    prepare_table();
+}
+
+// Two vector operands.
+size_t jit_less_emitter::get_inputs_num() {
+    return 2;
+}
+
+// Picks the emit_isa<> instantiation that matches host_isa_.
+// pool_vec_idxs / pool_gpr_idxs are accepted but not used by this emitter.
+void jit_less_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                 const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    switch (host_isa_) {
+    case cpu::sse42:
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx2:
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx512_common:
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+        break;
+    default:
+        assert(!"unsupported isa");
+    }
+}
+
+// Emits dst = (src0 < src1) ? one : zero.
+//   in_vec_idxs[0], in_vec_idxs[1] - vector register indices of the operands
+//   out_vec_idxs[0]                - vector register index of the result
+// aux_vec_idxs[0] holds the comparison mask and aux_vec_idxs[1] the "one" constant (SSE only).
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_less_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm lhs = Vmm(in_vec_idxs[0]);
+    Vmm rhs = Vmm(in_vec_idxs[1]);
+    Vmm result = Vmm(out_vec_idxs[0]);
+    Vmm mask = Vmm(aux_vec_idxs[0]);
+    Vmm ones = Vmm(aux_vec_idxs[1]);
+
+    if (isa == cpu::avx2) {
+        h->vcmpltps(mask, lhs, rhs);
+        h->uni_vmovups(result, table_val("zero"));
+        h->vblendvps(result, result, table_val("one"), mask);
+    } else if (isa == cpu::sse42) {
+        // NOTE(review): the two-operand blendvps takes its mask implicitly from XMM0, so this
+        // sequence relies on aux_vec_idxs[0] being register 0 - confirm the base emitter
+        // guarantees that on SSE.
+        h->movups(mask, lhs);
+        h->cmpps(mask, rhs, _cmp_lt_os);
+        h->movups(ones, table_val("one"));
+        h->pxor(result, result);
+        h->blendvps(result, ones);
+    } else {
+        // AVX-512: compare into the opmask register and blend under that mask.
+        h->vcmpps(k_mask, lhs, rhs, _cmp_lt_os);
+        h->uni_vmovups(result, table_val("zero"));
+        h->vblendmps(result | k_mask, result, table_val("one"));
+    }
+}
+
+// Declares the constants used by emit_isa(): all-zero bits and the bit pattern of 1.0f.
+void jit_less_emitter::register_table_entries() {
+    push_arg_entry_of("zero", 0x00000000, true);
+    push_arg_entry_of("one", 0x3f800000, true);
+}
+
+// Two scratch vector registers: comparison mask and "one" constant.
+size_t jit_less_emitter::aux_vecs_count() const {
+    return 2;
+}
+
+/// LESS_EQUAL ///
+// Lane-wise "less or equal" test on packed single-precision values: lanes where
+// src0 <= src1 receive the "one" table constant (0x3f800000, i.e. 1.0f), all other lanes
+// receive zero.
+jit_less_equal_emitter::jit_less_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+    // emit_isa() reads constants through table_val(); presumably prepare_table() sets up
+    // the entries declared in register_table_entries() - confirm in jit_emitter.
+    prepare_table();
+}
+
+// Two vector operands.
+size_t jit_less_equal_emitter::get_inputs_num() {
+    return 2;
+}
+
+// Picks the emit_isa<> instantiation that matches host_isa_.
+// pool_vec_idxs / pool_gpr_idxs are accepted but not used by this emitter.
+void jit_less_equal_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                       const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    switch (host_isa_) {
+    case cpu::sse42:
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx2:
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx512_common:
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+        break;
+    default:
+        assert(!"unsupported isa");
+    }
+}
+
+// Emits dst = (src0 <= src1) ? one : zero.
+//   in_vec_idxs[0], in_vec_idxs[1] - vector register indices of the operands
+//   out_vec_idxs[0]                - vector register index of the result
+// aux_vec_idxs[0] holds the comparison mask and aux_vec_idxs[1] the "one" constant (SSE only).
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_less_equal_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm lhs = Vmm(in_vec_idxs[0]);
+    Vmm rhs = Vmm(in_vec_idxs[1]);
+    Vmm result = Vmm(out_vec_idxs[0]);
+    Vmm mask = Vmm(aux_vec_idxs[0]);
+    Vmm ones = Vmm(aux_vec_idxs[1]);
+
+    if (isa == cpu::avx2) {
+        h->vcmpleps(mask, lhs, rhs);
+        h->uni_vmovups(result, table_val("zero"));
+        h->vblendvps(result, result, table_val("one"), mask);
+    } else if (isa == cpu::sse42) {
+        // NOTE(review): the two-operand blendvps takes its mask implicitly from XMM0, so this
+        // sequence relies on aux_vec_idxs[0] being register 0 - confirm the base emitter
+        // guarantees that on SSE.
+        h->movups(mask, lhs);
+        h->cmpps(mask, rhs, _cmp_le_os);
+        h->movups(ones, table_val("one"));
+        h->pxor(result, result);
+        h->blendvps(result, ones);
+    } else {
+        // AVX-512: compare into the opmask register and blend under that mask.
+        h->vcmpps(k_mask, lhs, rhs, _cmp_le_os);
+        h->uni_vmovups(result, table_val("zero"));
+        h->vblendmps(result | k_mask, result, table_val("one"));
+    }
+}
+
+// Declares the constants used by emit_isa(): all-zero bits and the bit pattern of 1.0f.
+void jit_less_equal_emitter::register_table_entries() {
+    push_arg_entry_of("zero", 0x00000000, true);
+    push_arg_entry_of("one", 0x3f800000, true);
+}
+
+// Two scratch vector registers: comparison mask and "one" constant.
+size_t jit_less_equal_emitter::aux_vecs_count() const {
+    return 2;
+}
+
+/// LOGICAL_AND ///
+jit_logical_and_emitter::jit_logical_and_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+: jit_emitter(host, host_isa, node, exec_prc) {
+ prepare_table();
+}
+
+size_t jit_logical_and_emitter::get_inputs_num() { return 2; }
+
+void jit_logical_and_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+ const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+ if (host_isa_ == cpu::sse42) {
+ emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+ } else if (host_isa_ == cpu::avx2) {
+ emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+ } else if (host_isa_ == cpu::avx512_common) {
+ emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+ } else {
+ assert(!"unsupported isa");
+ }
+}
+
+/// Emits the element-wise logical AND for one vector register.
+///
+/// Each operand is first normalized to a "boolean" float: lanes that compare
+/// equal to zero become the table constant "zero", every other lane becomes
+/// the table constant "one". The two normalized vectors are then combined
+/// with a bitwise AND and the result is left in register out_vec_idxs[0].
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_logical_and_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm lhs = Vmm(in_vec_idxs[0]);
+    Vmm rhs = Vmm(in_vec_idxs[1]);
+    Vmm result = Vmm(out_vec_idxs[0]);
+    Vmm scratch_a = Vmm(aux_vec_idxs[0]);
+    Vmm scratch_b = Vmm(aux_vec_idxs[1]);
+    Vmm scratch_c = Vmm(aux_vec_idxs[2]);
+
+    if (isa == cpu::sse42) {
+        // NOTE(review): SSE4.1 blendvps takes its selector implicitly from xmm0, so
+        // this path presumably relies on aux_vec_idxs[0] being register 0 - confirm
+        // against jit_emitter.
+
+        // result <- normalized lhs (lhs is read before result is overwritten)
+        h->pxor(scratch_a, scratch_a);
+        h->cmpps(scratch_a, lhs, _cmp_eq_oq);
+        h->movups(result, table_val("one"));
+        h->pxor(scratch_b, scratch_b);
+        h->blendvps(result, scratch_b);
+
+        // scratch_c <- normalized rhs
+        h->pxor(scratch_a, scratch_a);
+        h->cmpps(scratch_a, rhs, _cmp_eq_oq);
+        h->movups(scratch_c, table_val("one"));
+        h->pxor(scratch_b, scratch_b);
+        h->blendvps(scratch_c, scratch_b);
+
+        h->uni_vandps(result, result, scratch_c);
+    } else if (isa == cpu::avx2) {
+        // result <- normalized lhs
+        h->vcmpeqps(scratch_a, lhs, table_val("zero"));
+        h->uni_vmovups(result, table_val("one"));
+        h->vblendvps(result, result, table_val("zero"), scratch_a);
+
+        // scratch_a <- normalized rhs
+        h->vcmpeqps(scratch_b, rhs, table_val("zero"));
+        h->uni_vmovups(scratch_a, table_val("one"));
+        h->vblendvps(scratch_a, scratch_a, table_val("zero"), scratch_b);
+
+        h->uni_vandps(result, result, scratch_a);
+    } else {
+        // result <- normalized lhs; scratch_a keeps the all-"one" vector
+        h->vcmpps(k_mask, lhs, table_val("zero"), _cmp_eq_oq);
+        h->uni_vmovups(scratch_a, table_val("one"));
+        h->vblendmps(result | k_mask, scratch_a, table_val("zero"));
+
+        // scratch_a <- normalized rhs
+        h->vcmpps(k_mask, rhs, table_val("zero"), _cmp_eq_oq);
+        h->vblendmps(scratch_a | k_mask, scratch_a, table_val("zero"));
+
+        h->uni_vandps(result, result, scratch_a);
+    }
+}
+
+/// Registers the constants emit_isa() fetches with table_val(): "zero" and
+/// "one" (0x3f800000, the IEEE-754 fp32 encoding of 1.0f).
+void jit_logical_and_emitter::register_table_entries() {
+    const int fp32_zero_bits = 0x00000000;
+    const int fp32_one_bits = 0x3f800000;
+
+    // NOTE(review): the trailing `true` presumably asks for the value to be
+    // replicated across the whole vector - confirm in jit_emitter.
+    push_arg_entry_of("zero", fp32_zero_bits, true);
+    push_arg_entry_of("one", fp32_one_bits, true);
+}
+
+/// emit_isa() indexes aux_vec_idxs[0..2], so three scratch vectors are requested.
+size_t jit_logical_and_emitter::aux_vecs_count() const {
+    const size_t scratch_vectors = 3;
+    return scratch_vectors;
+}
+
+
+/// LOGICAL_OR ///
+/// Forwards all arguments to jit_emitter and then runs prepare_table().
+jit_logical_or_emitter::jit_logical_or_emitter(jit_generator *host,
+                                               cpu_isa_t host_isa,
+                                               const MKLDNNNode& node,
+                                               Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+    this->prepare_table();
+}
+
+/// Logical OR is a binary operation: emit_isa() reads in_vec_idxs[0] and in_vec_idxs[1].
+size_t jit_logical_or_emitter::get_inputs_num() {
+    const size_t input_count = 2;
+    return input_count;
+}
+
+/// Selects the emit_isa() instantiation that matches host_isa_.
+/// pool_vec_idxs and pool_gpr_idxs are accepted but not used here. Any ISA
+/// other than sse42 / avx2 / avx512_common trips the assert (and emits
+/// nothing when asserts are compiled out).
+void jit_logical_or_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                       const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    switch (host_isa_) {
+    case cpu::sse42:
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx2:
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx512_common:
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+        break;
+    default:
+        assert(!"unsupported isa");
+        break;
+    }
+}
+
+/// Emits the element-wise logical OR for one vector register.
+///
+/// Each operand is first normalized to a "boolean" float: lanes that compare
+/// equal to zero become the table constant "zero", every other lane becomes
+/// the table constant "one". The two normalized vectors are then combined
+/// with a bitwise OR and the result is left in register out_vec_idxs[0].
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_logical_or_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm lhs = Vmm(in_vec_idxs[0]);
+    Vmm rhs = Vmm(in_vec_idxs[1]);
+    Vmm result = Vmm(out_vec_idxs[0]);
+    Vmm scratch_a = Vmm(aux_vec_idxs[0]);
+    Vmm scratch_b = Vmm(aux_vec_idxs[1]);
+    Vmm scratch_c = Vmm(aux_vec_idxs[2]);
+
+    if (isa == cpu::sse42) {
+        // NOTE(review): SSE4.1 blendvps takes its selector implicitly from xmm0, so
+        // this path presumably relies on aux_vec_idxs[0] being register 0 - confirm
+        // against jit_emitter.
+
+        // result <- normalized lhs (lhs is read before result is overwritten)
+        h->pxor(scratch_a, scratch_a);
+        h->cmpps(scratch_a, lhs, _cmp_eq_oq);
+        h->movups(result, table_val("one"));
+        h->pxor(scratch_b, scratch_b);
+        h->blendvps(result, scratch_b);
+
+        // scratch_c <- normalized rhs
+        h->pxor(scratch_a, scratch_a);
+        h->cmpps(scratch_a, rhs, _cmp_eq_oq);
+        h->movups(scratch_c, table_val("one"));
+        h->pxor(scratch_b, scratch_b);
+        h->blendvps(scratch_c, scratch_b);
+
+        h->uni_vorps(result, result, scratch_c);
+    } else if (isa == cpu::avx2) {
+        // result <- normalized lhs
+        h->vcmpeqps(scratch_a, lhs, table_val("zero"));
+        h->uni_vmovups(result, table_val("one"));
+        h->vblendvps(result, result, table_val("zero"), scratch_a);
+
+        // scratch_a <- normalized rhs
+        h->vcmpeqps(scratch_b, rhs, table_val("zero"));
+        h->uni_vmovups(scratch_a, table_val("one"));
+        h->vblendvps(scratch_a, scratch_a, table_val("zero"), scratch_b);
+
+        h->uni_vorps(result, result, scratch_a);
+    } else {
+        // result <- normalized lhs; scratch_a keeps the all-"one" vector
+        h->vcmpps(k_mask, lhs, table_val("zero"), _cmp_eq_oq);
+        h->uni_vmovups(scratch_a, table_val("one"));
+        h->vblendmps(result | k_mask, scratch_a, table_val("zero"));
+
+        // scratch_a <- normalized rhs
+        h->vcmpps(k_mask, rhs, table_val("zero"), _cmp_eq_oq);
+        h->vblendmps(scratch_a | k_mask, scratch_a, table_val("zero"));
+
+        h->uni_vorps(result, result, scratch_a);
+    }
+}
+
+/// Registers the constants emit_isa() fetches with table_val(): "zero" and
+/// "one" (0x3f800000, the IEEE-754 fp32 encoding of 1.0f).
+void jit_logical_or_emitter::register_table_entries() {
+    const int fp32_zero_bits = 0x00000000;
+    const int fp32_one_bits = 0x3f800000;
+
+    // NOTE(review): the trailing `true` presumably asks for the value to be
+    // replicated across the whole vector - confirm in jit_emitter.
+    push_arg_entry_of("zero", fp32_zero_bits, true);
+    push_arg_entry_of("one", fp32_one_bits, true);
+}
+
+/// emit_isa() indexes aux_vec_idxs[0..2], so three scratch vectors are requested.
+size_t jit_logical_or_emitter::aux_vecs_count() const {
+    const size_t scratch_vectors = 3;
+    return scratch_vectors;
+}
+
+/// LOGICAL_XOR ///
+/// Forwards all arguments to jit_emitter and then runs prepare_table().
+jit_logical_xor_emitter::jit_logical_xor_emitter(jit_generator *host,
+                                                 cpu_isa_t host_isa,
+                                                 const MKLDNNNode& node,
+                                                 Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+    this->prepare_table();
+}
+
+/// Logical XOR is a binary operation: emit_isa() reads in_vec_idxs[0] and in_vec_idxs[1].
+size_t jit_logical_xor_emitter::get_inputs_num() {
+    const size_t input_count = 2;
+    return input_count;
+}
+
+/// Selects the emit_isa() instantiation that matches host_isa_.
+/// pool_vec_idxs and pool_gpr_idxs are accepted but not used here. Any ISA
+/// other than sse42 / avx2 / avx512_common trips the assert (and emits
+/// nothing when asserts are compiled out).
+void jit_logical_xor_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                        const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    switch (host_isa_) {
+    case cpu::sse42:
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx2:
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx512_common:
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+        break;
+    default:
+        assert(!"unsupported isa");
+        break;
+    }
+}
+
+/// Emits the element-wise logical XOR for one vector register.
+///
+/// Each operand is first normalized to a "boolean" float: lanes that compare
+/// equal to zero become the table constant "zero", every other lane becomes
+/// the table constant "one". The two normalized vectors are then combined
+/// with a bitwise XOR (identical bit patterns cancel to zero) and the result
+/// is left in register out_vec_idxs[0].
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_logical_xor_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm lhs = Vmm(in_vec_idxs[0]);
+    Vmm rhs = Vmm(in_vec_idxs[1]);
+    Vmm result = Vmm(out_vec_idxs[0]);
+    Vmm scratch_a = Vmm(aux_vec_idxs[0]);
+    Vmm scratch_b = Vmm(aux_vec_idxs[1]);
+    Vmm scratch_c = Vmm(aux_vec_idxs[2]);
+
+    if (isa == cpu::sse42) {
+        // NOTE(review): SSE4.1 blendvps takes its selector implicitly from xmm0, so
+        // this path presumably relies on aux_vec_idxs[0] being register 0 - confirm
+        // against jit_emitter.
+
+        // result <- normalized lhs (lhs is read before result is overwritten)
+        h->pxor(scratch_a, scratch_a);
+        h->cmpps(scratch_a, lhs, _cmp_eq_oq);
+        h->movups(result, table_val("one"));
+        h->pxor(scratch_b, scratch_b);
+        h->blendvps(result, scratch_b);
+
+        // scratch_c <- normalized rhs
+        h->pxor(scratch_a, scratch_a);
+        h->cmpps(scratch_a, rhs, _cmp_eq_oq);
+        h->movups(scratch_c, table_val("one"));
+        h->pxor(scratch_b, scratch_b);
+        h->blendvps(scratch_c, scratch_b);
+
+        h->uni_vxorps(result, result, scratch_c);
+    } else if (isa == cpu::avx2) {
+        // result <- normalized lhs
+        h->vcmpeqps(scratch_a, lhs, table_val("zero"));
+        h->uni_vmovups(result, table_val("one"));
+        h->vblendvps(result, result, table_val("zero"), scratch_a);
+
+        // scratch_a <- normalized rhs
+        h->vcmpeqps(scratch_b, rhs, table_val("zero"));
+        h->uni_vmovups(scratch_a, table_val("one"));
+        h->vblendvps(scratch_a, scratch_a, table_val("zero"), scratch_b);
+
+        h->uni_vxorps(result, result, scratch_a);
+    } else {
+        // result <- normalized lhs; scratch_a keeps the all-"one" vector
+        h->vcmpps(k_mask, lhs, table_val("zero"), _cmp_eq_oq);
+        h->uni_vmovups(scratch_a, table_val("one"));
+        h->vblendmps(result | k_mask, scratch_a, table_val("zero"));
+
+        // scratch_a <- normalized rhs
+        h->vcmpps(k_mask, rhs, table_val("zero"), _cmp_eq_oq);
+        h->vblendmps(scratch_a | k_mask, scratch_a, table_val("zero"));
+
+        h->uni_vxorps(result, result, scratch_a);
+    }
+}
+
+/// Registers the constants emit_isa() fetches with table_val(): "zero" and
+/// "one" (0x3f800000, the IEEE-754 fp32 encoding of 1.0f).
+void jit_logical_xor_emitter::register_table_entries() {
+    const int fp32_zero_bits = 0x00000000;
+    const int fp32_one_bits = 0x3f800000;
+
+    // NOTE(review): the trailing `true` presumably asks for the value to be
+    // replicated across the whole vector - confirm in jit_emitter.
+    push_arg_entry_of("zero", fp32_zero_bits, true);
+    push_arg_entry_of("one", fp32_one_bits, true);
+}
+
+/// emit_isa() indexes aux_vec_idxs[0..2], so three scratch vectors are requested.
+size_t jit_logical_xor_emitter::aux_vecs_count() const {
+    const size_t scratch_vectors = 3;
+    return scratch_vectors;
+}
+
+/// LOGICAL_NOT ///
+/// Forwards all arguments to jit_emitter and then runs prepare_table().
+jit_logical_not_emitter::jit_logical_not_emitter(jit_generator *host,
+                                                 cpu_isa_t host_isa,
+                                                 const MKLDNNNode& node,
+                                                 Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+    this->prepare_table();
+}
+
+/// Logical NOT is a unary operation: emit_isa() reads only in_vec_idxs[0].
+size_t jit_logical_not_emitter::get_inputs_num() {
+    const size_t input_count = 1;
+    return input_count;
+}
+
+/// Selects the emit_isa() instantiation that matches host_isa_.
+/// pool_vec_idxs and pool_gpr_idxs are accepted but not used here. Any ISA
+/// other than sse42 / avx2 / avx512_common trips the assert (and emits
+/// nothing when asserts are compiled out).
+void jit_logical_not_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                        const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    switch (host_isa_) {
+    case cpu::sse42:
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx2:
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx512_common:
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+        break;
+    default:
+        assert(!"unsupported isa");
+        break;
+    }
+}
+
+/// Emits the element-wise logical NOT for one vector register: lanes of the
+/// input that compare equal to zero produce the table constant "one", all
+/// other lanes produce zero. The result is left in register out_vec_idxs[0].
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_logical_not_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm operand = Vmm(in_vec_idxs[0]);
+    Vmm result = Vmm(out_vec_idxs[0]);
+    Vmm scratch_a = Vmm(aux_vec_idxs[0]);
+    Vmm scratch_b = Vmm(aux_vec_idxs[1]);
+
+    if (isa == cpu::sse42) {
+        // NOTE(review): SSE4.1 blendvps takes its selector implicitly from xmm0, so
+        // this path presumably relies on aux_vec_idxs[0] being register 0 - confirm
+        // against jit_emitter.
+        h->pxor(scratch_a, scratch_a);
+        h->cmpps(scratch_a, operand, _cmp_eq_oq);
+        h->movups(scratch_b, table_val("one"));
+        // The operand has already been consumed by the compare, so clearing the
+        // result is safe even when both share a register.
+        h->pxor(result, result);
+        h->blendvps(result, scratch_b);
+    } else if (isa == cpu::avx2) {
+        h->vcmpeqps(scratch_a, operand, table_val("zero"));
+        h->uni_vmovups(result, table_val("zero"));
+        h->vblendvps(result, result, table_val("one"), scratch_a);
+    } else {
+        h->vcmpps(k_mask, operand, table_val("zero"), _cmp_eq_oq);
+        h->uni_vmovups(result, table_val("zero"));
+        h->vblendmps(result | k_mask, result, table_val("one"));
+    }
+}
+
+/// Registers the constants emit_isa() fetches with table_val(): "zero" and
+/// "one" (0x3f800000, the IEEE-754 fp32 encoding of 1.0f).
+void jit_logical_not_emitter::register_table_entries() {
+    const int fp32_zero_bits = 0x00000000;
+    const int fp32_one_bits = 0x3f800000;
+
+    // NOTE(review): the trailing `true` presumably asks for the value to be
+    // replicated across the whole vector - confirm in jit_emitter.
+    push_arg_entry_of("zero", fp32_zero_bits, true);
+    push_arg_entry_of("one", fp32_one_bits, true);
+}
+
+/// emit_isa() indexes aux_vec_idxs[0] and aux_vec_idxs[1], so two scratch
+/// vectors are requested.
+size_t jit_logical_not_emitter::aux_vecs_count() const {
+    const size_t scratch_vectors = 2;
+    return scratch_vectors;
+}
+
+/// POWER_STATIC ///
+/// Forwards all arguments to jit_emitter and then runs prepare_table().
+jit_power_static_emitter::jit_power_static_emitter(jit_generator *host,
+                                                   cpu_isa_t host_isa,
+                                                   const MKLDNNNode& node,
+                                                   Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+    this->prepare_table();
+}
+
+/// The static power kernel is unary: emit_isa() reads only in_vec_idxs[0]
+/// (power, scale and shift come from the layer, not from input vectors).
+size_t jit_power_static_emitter::get_inputs_num() {
+    const size_t input_count = 1;
+    return input_count;
+}
+
+/// Selects the emit_isa() instantiation that matches host_isa_.
+/// pool_vec_idxs and pool_gpr_idxs are accepted but not used here. Any ISA
+/// other than sse42 / avx2 / avx512_common trips the assert (and emits
+/// nothing when asserts are compiled out).
+void jit_power_static_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                         const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    switch (host_isa_) {
+    case cpu::sse42:
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx2:
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx512_common:
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+        break;
+    default:
+        assert(!"unsupported isa");
+        break;
+    }
+}
+
+/// Emits dst = (scale * src + shift) ^ power for one vector register, with
+/// power / scale / shift read from the node's InferenceEngine::PowerLayer.
+///
+/// The exponent is handled by the cheapest applicable strategy:
+///   * power == 1          : nothing beyond the scale/shift step;
+///   * power == +-0.5      : sqrt, followed by a reciprocal for -0.5;
+///   * integral, non-zero  : repeated multiplication (|power| - 1 multiplies),
+///                           followed by a reciprocal for negative exponents;
+///   * anything else       : a per-lane call to powf() with all registers
+///                           spilled to the stack around the call.
+///
+/// Integral exponents whose magnitude does not fit in `int` (including
+/// +-infinity, for which floor(power) == power also holds) are sent to the
+/// powf() fallback: converting them with static_cast<int> would be undefined
+/// behaviour.
+///
+/// @param in_vec_idxs   in_vec_idxs[0] is the source vector register index.
+/// @param out_vec_idxs  out_vec_idxs[0] is the destination vector register index.
+/// @throws the exception raised by THROW_IE_EXCEPTION when the node's layer is
+///         not a PowerLayer.
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_power_static_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
+    Vmm vmm_dst = Vmm(out_vec_idxs[0]);
+    Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
+
+    auto *powerLayer = dynamic_cast<InferenceEngine::PowerLayer *>(n.getCnnLayer().get());
+    if (powerLayer == nullptr)
+        THROW_IE_EXCEPTION << "Cannot convert power layer.";
+
+    float power = powerLayer->power;
+    float scale = powerLayer->scale;
+    float shift = powerLayer->offset;
+
+    // Scalar argument / return registers used for the powf() call below.
+    Xmm xmm0 = Xmm(0), xmm1 = Xmm(1);
+
+    // Step 1: vmm_dst = scale * src + shift (a plain copy when that is the identity).
+    if (scale != 1.f || shift != 0.f) {
+        if (isa == cpu::sse42) {
+            h->uni_vmovups(vmm_aux0, table_val("scale"));
+            h->uni_vmulps(vmm_aux0, vmm_aux0, vmm_src0);
+            h->uni_vmovups(vmm_dst, table_val("shift"));
+            h->uni_vaddps(vmm_dst, vmm_dst, vmm_aux0);
+        } else {
+            if (vmm_dst.getIdx() != vmm_src0.getIdx()) {
+                h->uni_vmovups(vmm_dst, table_val("shift"));
+                h->uni_vfmadd231ps(vmm_dst, vmm_src0, table_val("scale"));
+            } else {
+                // In-place: accumulate in the scratch register so the source is
+                // not overwritten by "shift" before it is multiplied.
+                h->uni_vmovups(vmm_aux0, table_val("shift"));
+                h->uni_vfmadd231ps(vmm_aux0, vmm_src0, table_val("scale"));
+                h->uni_vmovups(vmm_dst, vmm_aux0);
+            }
+        }
+    } else {
+        if (vmm_dst.getIdx() != vmm_src0.getIdx())
+            h->uni_vmovups(vmm_dst, vmm_src0);
+    }
+
+    // Only unroll integral exponents that are safely representable as int;
+    // 2147483648.f is 2^31, so every accepted value (and its negation) fits.
+    const bool is_unrollable_integer_power =
+            std::floor(power) == power && power != 0 && std::fabs(power) < 2147483648.f;
+
+    // Step 2: raise vmm_dst to `power`.
+    if (power == 1.f) {
+        // x^1 == x: nothing to emit.
+    } else if (power == 0.5f || power == -0.5f) {
+        h->uni_vsqrtps(vmm_dst, vmm_dst);
+
+        if (power < 0.f) {
+            h->uni_vmovups(vmm_aux0, table_val("one"));
+            if (isa == cpu::sse42) {
+                h->uni_vdivps(vmm_aux0, vmm_aux0, vmm_dst);
+                h->uni_vmovups(vmm_dst, vmm_aux0);
+            } else {
+                h->uni_vdivps(vmm_dst, vmm_aux0, vmm_dst);
+            }
+        }
+    } else if (is_unrollable_integer_power) {
+        int ipower = std::abs(static_cast<int>(power));
+        h->uni_vmovups(vmm_aux0, vmm_dst);
+        for (int i = 1; i < ipower; i++) {
+            h->uni_vmulps(vmm_dst, vmm_dst, vmm_aux0);
+        }
+
+        if (power < 0.f) {
+            h->uni_vmovups(vmm_aux0, table_val("one"));
+            if (isa == cpu::sse42) {
+                h->uni_vdivps(vmm_aux0, vmm_aux0, vmm_dst);
+                h->uni_vmovups(vmm_dst, vmm_aux0);
+            } else {
+                h->uni_vdivps(vmm_dst, vmm_aux0, vmm_dst);
+            }
+        }
+    } else {
+        h->uni_vmovups(vmm_aux0, table_val("power"));
+
+        // caller obligation to save gprs as callee may use them
+        size_t gpr_size = 8;
+        Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->rax,
+                                         h->rcx, h->rdx, h->rdi, h->rsi, h->rbp, h->rbx};
+        size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);
+
+        h->sub(h->rsp, n_gprs_to_save * gpr_size);
+        for (size_t i = 0; i < n_gprs_to_save; ++i)
+            h->mov(h->ptr[h->rsp + i * gpr_size], gprs_to_save[i]);
+
+        // caller obligation to save k-regs as callee may use them
+        size_t n_k_regs_to_save = 8;
+        if (isa == cpu::avx512_common || isa == cpu::avx512_core) {
+            h->sub(h->rsp, n_k_regs_to_save * k_mask_size);
+            for (size_t i = 0; i < n_k_regs_to_save; ++i) {
+                if (mayiuse(avx512_core))
+                    h->kmovq(h->ptr[h->rsp + i * k_mask_size], Opmask(i));
+                else
+                    h->kmovw(h->ptr[h->rsp + i * k_mask_size], Opmask(i));
+            }
+        }
+
+        // 1. Caller obligation to save vector registers as callee may use them.
+        // 2. Additionally reserve two slots at the bottom: slot 0 holds the base
+        //    values (and receives the result in-place), slot 1 holds the exponent.
+        // 3. There is an implicit assumption that the host code uses the same
+        //    `isa` as the injector. Once the assumption is wrong, `vecs_count` and
+        //    `vlen` should be replaced with `host_isa::vlen` and
+        //    `host_isa::vecs_count`.
+        h->sub(h->rsp, (get_max_vecs_count() + 2) * get_vec_length());
+        for (size_t i = 2; i < get_max_vecs_count() + 2; ++i)
+            h->uni_vmovups(h->ptr[h->rsp + i * get_vec_length()], Vmm(i - 2));
+        h->uni_vmovups(h->ptr[h->rsp + 0 * get_vec_length()], vmm_dst); // base values
+        h->uni_vmovups(h->ptr[h->rsp + 1 * get_vec_length()], vmm_aux0); // exponent
+
+        // save function address in gpr to pass it in the call instruction
+        h->mov(h->rbp, reinterpret_cast<uintptr_t>(powf));
+
+        // align stack on 16-byte as ABI requires; rbx remembers the adjustment
+        h->mov(h->rbx, h->rsp);
+        h->and_(h->rbx, 0xf);
+        h->sub(h->rsp, h->rbx);
+
+        // Take each lane, apply powf on it and replace the value on the stack with the result.
+        for (size_t i = 0; i < get_vec_length() / sizeof(float); ++i) {
+            const Address &source = h->ptr[h->rsp + h->rbx + i * sizeof(float)];
+            h->uni_vmovss(xmm0, source);
+            h->uni_vmovss(xmm1, h->ptr[h->rsp + h->rbx + get_vec_length() + i * sizeof(float)]);
+            h->call(h->rbp);
+            h->uni_vmovss(source, xmm0);
+        }
+
+        // undo the alignment adjustment
+        h->add(h->rsp, h->rbx);
+
+        // restore vector registers, then load the computed result into vmm_dst
+        for (size_t i = get_max_vecs_count() + 1; i >= 2; --i)
+            h->uni_vmovups(Vmm(i - 2), h->ptr[h->rsp + i * get_vec_length()]);
+        h->uni_vmovups(vmm_dst, h->ptr[h->rsp + 0 * get_vec_length()]);
+        h->add(h->rsp, (get_max_vecs_count() + 2) * get_vec_length());
+
+        // restore k registers
+        if (isa == cpu::avx512_common || isa == cpu::avx512_core) {
+            for (int i = n_k_regs_to_save - 1; i >= 0; --i) {
+                if (mayiuse(avx512_core))
+                    h->kmovq(Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
+                else
+                    h->kmovw(Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
+            }
+            h->add(h->rsp, n_k_regs_to_save * k_mask_size);
+        }
+
+        // restore gpr registers
+        for (int i = n_gprs_to_save - 1; i >= 0; --i)
+            h->mov(gprs_to_save[i], h->ptr[h->rsp + i * gpr_size]);
+        h->add(h->rsp, n_gprs_to_save * gpr_size);
+    }
+}
+
+/// Publishes the PowerLayer parameters (power, scale, offset) and the constant
+/// 1.0f as table entries "power", "scale", "shift" and "one", each converted
+/// with float2int().
+/// Throws via THROW_IE_EXCEPTION when the node's layer is not a PowerLayer.
+void jit_power_static_emitter::register_table_entries() {
+    auto *layer = dynamic_cast<InferenceEngine::PowerLayer *>(n.getCnnLayer().get());
+    if (layer == nullptr) {
+        THROW_IE_EXCEPTION << "Cannot convert power layer.";
+    }
+
+    const float exponent = layer->power;
+    const float multiplier = layer->scale;
+    const float addend = layer->offset;
+
+    // NOTE(review): the trailing `true` presumably asks for the value to be
+    // replicated across the whole vector - confirm in jit_emitter.
+    push_arg_entry_of("power", float2int(exponent), true);
+    push_arg_entry_of("scale", float2int(multiplier), true);
+    push_arg_entry_of("shift", float2int(addend), true);
+    push_arg_entry_of("one", float2int(1.f), true);
+}
+
+/// emit_isa() uses a single scratch vector (aux_vec_idxs[0]).
+size_t jit_power_static_emitter::aux_vecs_count() const {
+    const size_t scratch_vectors = 1;
+    return scratch_vectors;
+}
+
+/// PRELU ///
+/// Forwards all arguments to jit_emitter and then runs prepare_table().
+jit_prelu_emitter::jit_prelu_emitter(jit_generator *host,
+                                     cpu_isa_t host_isa,
+                                     const MKLDNNNode& node,
+                                     Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+    this->prepare_table();
+}
+
+/// PReLU takes two input vectors: emit_isa() reads in_vec_idxs[0] (data) and
+/// in_vec_idxs[1] (the per-lane multiplier).
+size_t jit_prelu_emitter::get_inputs_num() {
+    const size_t input_count = 2;
+    return input_count;
+}
+
+/// Selects the emit_isa() instantiation that matches host_isa_.
+/// pool_vec_idxs and pool_gpr_idxs are accepted but not used here. Any ISA
+/// other than sse42 / avx2 / avx512_common trips the assert (and emits
+/// nothing when asserts are compiled out).
+void jit_prelu_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                                  const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    switch (host_isa_) {
+    case cpu::sse42:
+        emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx2:
+        emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
+        break;
+    case cpu::avx512_common:
+        emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
+        break;
+    default:
+        assert(!"unsupported isa");
+        break;
+    }
+}
+
+/// Emits PReLU for one vector register: lanes of the data vector selected by a
+/// sign comparison against zero are replaced with data * slope, the remaining
+/// lanes keep the data value. The result is left in register out_vec_idxs[0].
+template <mkldnn::impl::cpu::cpu_isa_t isa>
+void jit_prelu_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
+    Vmm data = Vmm(in_vec_idxs[0]);
+    Vmm slope = Vmm(in_vec_idxs[1]);
+    Vmm result = Vmm(out_vec_idxs[0]);
+    Vmm scratch_a = Vmm(aux_vec_idxs[0]);
+    Vmm scratch_b = Vmm(aux_vec_idxs[1]);
+
+    if (isa == cpu::sse42) {
+        // NOTE(review): SSE4.1 blendvps takes its selector implicitly from xmm0, so
+        // this path presumably relies on aux_vec_idxs[0] being register 0 - confirm
+        // against jit_emitter.
+        // scratch_a = compare(0, data) with _cmp_gt_os, presumably the "data < 0" lanes.
+        h->pxor(scratch_a, scratch_a);
+        h->cmpps(scratch_a, data, _cmp_gt_os);
+        // scratch_b = slope * data
+        h->movups(scratch_b, slope);
+        h->mulps(scratch_b, data);
+        if (data.getIdx() != result.getIdx()) {
+            h->movups(result, data);
+        }
+        // selected lanes take the product, the others keep data
+        h->blendvps(result, scratch_b);
+    } else if (isa == cpu::avx2) {
+        // scratch_a = data * slope, scratch_b = (data > 0) mask
+        h->vmulps(scratch_a, data, slope);
+        h->vxorps(scratch_b, scratch_b, scratch_b);
+        h->vcmpgtps(scratch_b, data, scratch_b);
+        // masked lanes keep data, the others take the product
+        h->vblendvps(result, scratch_a, data, scratch_b);
+    } else if (isa == cpu::avx512_common) {
+        h->vxorpd(scratch_a, scratch_a, scratch_a);
+        if (data.getIdx() != result.getIdx()) {
+            h->vmovups(result, data);
+        }
+        // k_mask = (data < 0); only those lanes are overwritten with the product
+        h->vcmpps(k_mask, data, scratch_a, _cmp_lt_os);
+        h->vmulps(result | k_mask, data, slope);
+    }
+}
+
+/// emit_isa() indexes aux_vec_idxs[0] and aux_vec_idxs[1], so two scratch
+/// vectors are requested.
+size_t jit_prelu_emitter::aux_vecs_count() const {
+    const size_t scratch_vectors = 2;
+    return scratch_vectors;
+}
+
+} // namespace MKLDNNPlugin
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "common/emitter.h"
+#include "jit_generator.hpp"
+#include "mkldnn_node.h"
+
+namespace MKLDNNPlugin {
+
+/// jit_emitter subclass for the operation named by the class ("add"). Only the
+/// interface is declared here; the generated code is defined in the
+/// implementation file.
+class jit_add_emitter : public jit_emitter {
+public:
+    /// exec_prc defaults to FP32 when the caller passes no precision.
+    jit_add_emitter(mkldnn::impl::cpu::jit_generator *host,
+                    mkldnn::impl::cpu::cpu_isa_t host_isa,
+                    const MKLDNNNode& node,
+                    InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    /// ISA-specific code generation, instantiated per cpu_isa_t.
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs,
+                  const std::vector<size_t> &out_vec_idxs) const;
+
+    void emit_impl(const std::vector<size_t> &in_vec_idxs,
+                   const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs,
+                   const std::vector<size_t> &pool_gpr_idxs) override;
+};
+
+/// jit_emitter subclass for the operation named by the class ("mul_add"). Only
+/// the interface is declared here; the generated code is defined in the
+/// implementation file. Overrides aux_vecs_count() in addition to the common hooks.
+class jit_mul_add_emitter : public jit_emitter {
+public:
+    /// exec_prc defaults to FP32 when the caller passes no precision.
+    jit_mul_add_emitter(mkldnn::impl::cpu::jit_generator *host,
+                        mkldnn::impl::cpu::cpu_isa_t host_isa,
+                        const MKLDNNNode& node,
+                        InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    /// ISA-specific code generation, instantiated per cpu_isa_t.
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs,
+                  const std::vector<size_t> &out_vec_idxs) const;
+
+    void emit_impl(const std::vector<size_t> &in_vec_idxs,
+                   const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs,
+                   const std::vector<size_t> &pool_gpr_idxs) override;
+
+    size_t aux_vecs_count() const override;
+};
+
+
+/// jit_emitter subclass for the operation named by the class ("subtract").
+/// Only the interface is declared here; the generated code is defined in the
+/// implementation file.
+class jit_subtract_emitter : public jit_emitter {
+public:
+    /// exec_prc defaults to FP32 when the caller passes no precision.
+    jit_subtract_emitter(mkldnn::impl::cpu::jit_generator *host,
+                         mkldnn::impl::cpu::cpu_isa_t host_isa,
+                         const MKLDNNNode& node,
+                         InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    /// ISA-specific code generation, instantiated per cpu_isa_t.
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs,
+                  const std::vector<size_t> &out_vec_idxs) const;
+
+    void emit_impl(const std::vector<size_t> &in_vec_idxs,
+                   const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs,
+                   const std::vector<size_t> &pool_gpr_idxs) override;
+};
+
+
+/// jit_emitter subclass for the operation named by the class ("multiply").
+/// Only the interface is declared here; the generated code is defined in the
+/// implementation file.
+class jit_multiply_emitter : public jit_emitter {
+public:
+    /// exec_prc defaults to FP32 when the caller passes no precision.
+    jit_multiply_emitter(mkldnn::impl::cpu::jit_generator *host,
+                         mkldnn::impl::cpu::cpu_isa_t host_isa,
+                         const MKLDNNNode& node,
+                         InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    /// ISA-specific code generation, instantiated per cpu_isa_t.
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs,
+                  const std::vector<size_t> &out_vec_idxs) const;
+
+    void emit_impl(const std::vector<size_t> &in_vec_idxs,
+                   const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs,
+                   const std::vector<size_t> &pool_gpr_idxs) override;
+};
+
+
+/// jit_emitter subclass for the operation named by the class ("divide"). Only
+/// the interface is declared here; the generated code is defined in the
+/// implementation file.
+class jit_divide_emitter : public jit_emitter {
+public:
+    /// exec_prc defaults to FP32 when the caller passes no precision.
+    jit_divide_emitter(mkldnn::impl::cpu::jit_generator *host,
+                       mkldnn::impl::cpu::cpu_isa_t host_isa,
+                       const MKLDNNNode& node,
+                       InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    /// ISA-specific code generation, instantiated per cpu_isa_t.
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs,
+                  const std::vector<size_t> &out_vec_idxs) const;
+
+    void emit_impl(const std::vector<size_t> &in_vec_idxs,
+                   const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs,
+                   const std::vector<size_t> &pool_gpr_idxs) override;
+};
+
+
+/// jit_emitter subclass for the operation named by the class ("floor_mod").
+/// Only the interface is declared here; the generated code is defined in the
+/// implementation file. Overrides aux_vecs_count() in addition to the common hooks.
+class jit_floor_mod_emitter : public jit_emitter {
+public:
+    /// exec_prc defaults to FP32 when the caller passes no precision.
+    jit_floor_mod_emitter(mkldnn::impl::cpu::jit_generator *host,
+                          mkldnn::impl::cpu::cpu_isa_t host_isa,
+                          const MKLDNNNode& node,
+                          InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    /// ISA-specific code generation, instantiated per cpu_isa_t.
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs,
+                  const std::vector<size_t> &out_vec_idxs) const;
+
+    void emit_impl(const std::vector<size_t> &in_vec_idxs,
+                   const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs,
+                   const std::vector<size_t> &pool_gpr_idxs) override;
+
+    size_t aux_vecs_count() const override;
+};
+
+
+/// jit_emitter subclass for the operation named by the class ("mod"). Only the
+/// interface is declared here; the generated code is defined in the
+/// implementation file. Overrides aux_vecs_count() in addition to the common hooks.
+class jit_mod_emitter : public jit_emitter {
+public:
+    /// exec_prc defaults to FP32 when the caller passes no precision.
+    jit_mod_emitter(mkldnn::impl::cpu::jit_generator *host,
+                    mkldnn::impl::cpu::cpu_isa_t host_isa,
+                    const MKLDNNNode& node,
+                    InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    /// ISA-specific code generation, instantiated per cpu_isa_t.
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs,
+                  const std::vector<size_t> &out_vec_idxs) const;
+
+    void emit_impl(const std::vector<size_t> &in_vec_idxs,
+                   const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs,
+                   const std::vector<size_t> &pool_gpr_idxs) override;
+
+    size_t aux_vecs_count() const override;
+};
+
+
+/// jit_emitter subclass for the operation named by the class ("maximum"). Only
+/// the interface is declared here; the generated code is defined in the
+/// implementation file. Additionally exposes a static query for the set of
+/// precisions it supports.
+class jit_maximum_emitter : public jit_emitter {
+public:
+    /// exec_prc defaults to FP32 when the caller passes no precision.
+    jit_maximum_emitter(mkldnn::impl::cpu::jit_generator *host,
+                        mkldnn::impl::cpu::cpu_isa_t host_isa,
+                        const MKLDNNNode& node,
+                        InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+    /// Callable without an instance.
+    static std::set<InferenceEngine::Precision> get_supported_precisions();
+
+private:
+    /// ISA-specific code generation, instantiated per cpu_isa_t.
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs,
+                  const std::vector<size_t> &out_vec_idxs) const;
+
+    void emit_impl(const std::vector<size_t> &in_vec_idxs,
+                   const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs,
+                   const std::vector<size_t> &pool_gpr_idxs) override;
+};
+
+
+/// jit_emitter subclass for the operation named by the class ("minimum"). Only
+/// the interface is declared here; the generated code is defined in the
+/// implementation file. Additionally exposes a static query for the set of
+/// precisions it supports.
+class jit_minimum_emitter : public jit_emitter {
+public:
+    /// exec_prc defaults to FP32 when the caller passes no precision.
+    jit_minimum_emitter(mkldnn::impl::cpu::jit_generator *host,
+                        mkldnn::impl::cpu::cpu_isa_t host_isa,
+                        const MKLDNNNode& node,
+                        InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+    /// Callable without an instance.
+    static std::set<InferenceEngine::Precision> get_supported_precisions();
+
+private:
+    /// ISA-specific code generation, instantiated per cpu_isa_t.
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs,
+                  const std::vector<size_t> &out_vec_idxs) const;
+
+    void emit_impl(const std::vector<size_t> &in_vec_idxs,
+                   const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs,
+                   const std::vector<size_t> &pool_gpr_idxs) override;
+};
+
+
+/// jit_emitter subclass for the operation named by the class
+/// ("squared_difference"). Only the interface is declared here; the generated
+/// code is defined in the implementation file.
+class jit_squared_difference_emitter : public jit_emitter {
+public:
+    /// exec_prc defaults to FP32 when the caller passes no precision.
+    jit_squared_difference_emitter(mkldnn::impl::cpu::jit_generator *host,
+                                   mkldnn::impl::cpu::cpu_isa_t host_isa,
+                                   const MKLDNNNode& node,
+                                   InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    /// ISA-specific code generation, instantiated per cpu_isa_t.
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs,
+                  const std::vector<size_t> &out_vec_idxs) const;
+
+    void emit_impl(const std::vector<size_t> &in_vec_idxs,
+                   const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs,
+                   const std::vector<size_t> &pool_gpr_idxs) override;
+};
+
+
+/// jit_emitter subclass for the operation named by the class
+/// ("power_dynamic"). Only the interface is declared here; the generated code
+/// is defined in the implementation file.
+class jit_power_dynamic_emitter : public jit_emitter {
+public:
+    /// exec_prc defaults to FP32 when the caller passes no precision.
+    jit_power_dynamic_emitter(mkldnn::impl::cpu::jit_generator *host,
+                              mkldnn::impl::cpu::cpu_isa_t host_isa,
+                              const MKLDNNNode& node,
+                              InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    /// ISA-specific code generation, instantiated per cpu_isa_t.
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs,
+                  const std::vector<size_t> &out_vec_idxs) const;
+
+    void emit_impl(const std::vector<size_t> &in_vec_idxs,
+                   const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs,
+                   const std::vector<size_t> &pool_gpr_idxs) override;
+};
+
+
+/// jit_emitter subclass for the operation named by the class ("equal"). Only
+/// the interface is declared here; the generated code is defined in the
+/// implementation file. Supplies its own table entries and scratch-vector count.
+class jit_equal_emitter : public jit_emitter {
+public:
+    /// exec_prc defaults to FP32 when the caller passes no precision.
+    jit_equal_emitter(mkldnn::impl::cpu::jit_generator *host,
+                      mkldnn::impl::cpu::cpu_isa_t host_isa,
+                      const MKLDNNNode& node,
+                      InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    /// ISA-specific code generation, instantiated per cpu_isa_t.
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs,
+                  const std::vector<size_t> &out_vec_idxs) const;
+
+    void emit_impl(const std::vector<size_t> &in_vec_idxs,
+                   const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs,
+                   const std::vector<size_t> &pool_gpr_idxs) override;
+
+    void register_table_entries() override;
+    size_t aux_vecs_count() const override;
+};
+
+
+/// jit_emitter subclass for the operation named by the class ("not_equal").
+/// Only the interface is declared here; the generated code is defined in the
+/// implementation file. Supplies its own table entries and scratch-vector count.
+class jit_not_equal_emitter : public jit_emitter {
+public:
+    /// exec_prc defaults to FP32 when the caller passes no precision.
+    jit_not_equal_emitter(mkldnn::impl::cpu::jit_generator *host,
+                          mkldnn::impl::cpu::cpu_isa_t host_isa,
+                          const MKLDNNNode& node,
+                          InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    /// ISA-specific code generation, instantiated per cpu_isa_t.
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs,
+                  const std::vector<size_t> &out_vec_idxs) const;
+
+    void emit_impl(const std::vector<size_t> &in_vec_idxs,
+                   const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs,
+                   const std::vector<size_t> &pool_gpr_idxs) override;
+
+    void register_table_entries() override;
+    size_t aux_vecs_count() const override;
+};
+
+
+/// jit_emitter subclass for the operation named by the class ("greater"). Only
+/// the interface is declared here; the generated code is defined in the
+/// implementation file. Supplies its own table entries and scratch-vector count.
+class jit_greater_emitter : public jit_emitter {
+public:
+    /// exec_prc defaults to FP32 when the caller passes no precision.
+    jit_greater_emitter(mkldnn::impl::cpu::jit_generator *host,
+                        mkldnn::impl::cpu::cpu_isa_t host_isa,
+                        const MKLDNNNode& node,
+                        InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    /// ISA-specific code generation, instantiated per cpu_isa_t.
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs,
+                  const std::vector<size_t> &out_vec_idxs) const;
+
+    void emit_impl(const std::vector<size_t> &in_vec_idxs,
+                   const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs,
+                   const std::vector<size_t> &pool_gpr_idxs) override;
+
+    void register_table_entries() override;
+    size_t aux_vecs_count() const override;
+};
+
+
+/// jit_emitter subclass for the operation named by the class
+/// ("greater_equal"). Only the interface is declared here; the generated code
+/// is defined in the implementation file. Supplies its own table entries and
+/// scratch-vector count.
+class jit_greater_equal_emitter : public jit_emitter {
+public:
+    /// exec_prc defaults to FP32 when the caller passes no precision.
+    jit_greater_equal_emitter(mkldnn::impl::cpu::jit_generator *host,
+                              mkldnn::impl::cpu::cpu_isa_t host_isa,
+                              const MKLDNNNode& node,
+                              InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    /// ISA-specific code generation, instantiated per cpu_isa_t.
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs,
+                  const std::vector<size_t> &out_vec_idxs) const;
+
+    void emit_impl(const std::vector<size_t> &in_vec_idxs,
+                   const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs,
+                   const std::vector<size_t> &pool_gpr_idxs) override;
+
+    void register_table_entries() override;
+    size_t aux_vecs_count() const override;
+};
+
+
+/// jit_emitter subclass for the operation named by the class ("less"). Only
+/// the interface is declared here; the generated code is defined in the
+/// implementation file. Supplies its own table entries and scratch-vector count.
+class jit_less_emitter : public jit_emitter {
+public:
+    /// exec_prc defaults to FP32 when the caller passes no precision.
+    jit_less_emitter(mkldnn::impl::cpu::jit_generator *host,
+                     mkldnn::impl::cpu::cpu_isa_t host_isa,
+                     const MKLDNNNode& node,
+                     InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    /// ISA-specific code generation, instantiated per cpu_isa_t.
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs,
+                  const std::vector<size_t> &out_vec_idxs) const;
+
+    void emit_impl(const std::vector<size_t> &in_vec_idxs,
+                   const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs,
+                   const std::vector<size_t> &pool_gpr_idxs) override;
+
+    void register_table_entries() override;
+    size_t aux_vecs_count() const override;
+};
+
+
+/// Emitter for the element-wise "less or equal" comparison: its kernel
+/// compares the two inputs with a less-or-equal predicate and selects between
+/// the table constants "one" and "zero". Supplies its own table entries and
+/// scratch-vector count.
+class jit_less_equal_emitter : public jit_emitter {
+public:
+    /// exec_prc defaults to FP32 when the caller passes no precision.
+    jit_less_equal_emitter(mkldnn::impl::cpu::jit_generator *host,
+                           mkldnn::impl::cpu::cpu_isa_t host_isa,
+                           const MKLDNNNode& node,
+                           InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    /// ISA-specific code generation, instantiated per cpu_isa_t.
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs,
+                  const std::vector<size_t> &out_vec_idxs) const;
+
+    void emit_impl(const std::vector<size_t> &in_vec_idxs,
+                   const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs,
+                   const std::vector<size_t> &pool_gpr_idxs) override;
+
+    void register_table_entries() override;
+    size_t aux_vecs_count() const override;
+};
+
+
+/// Two-input emitter for element-wise logical AND: both operands are
+/// normalized to the table constants "zero" / "one" and combined with a
+/// bitwise AND. Supplies its own table entries and scratch-vector count.
+class jit_logical_and_emitter : public jit_emitter {
+public:
+    /// exec_prc defaults to FP32 when the caller passes no precision.
+    jit_logical_and_emitter(mkldnn::impl::cpu::jit_generator *host,
+                            mkldnn::impl::cpu::cpu_isa_t host_isa,
+                            const MKLDNNNode& node,
+                            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    /// ISA-specific code generation, instantiated per cpu_isa_t.
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs,
+                  const std::vector<size_t> &out_vec_idxs) const;
+
+    void emit_impl(const std::vector<size_t> &in_vec_idxs,
+                   const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs,
+                   const std::vector<size_t> &pool_gpr_idxs) override;
+
+    void register_table_entries() override;
+    size_t aux_vecs_count() const override;
+};
+
+
+/// Two-input emitter for element-wise logical OR: both operands are
+/// normalized to the table constants "zero" / "one" and combined with a
+/// bitwise OR. Supplies its own table entries and scratch-vector count.
+class jit_logical_or_emitter : public jit_emitter {
+public:
+    /// exec_prc defaults to FP32 when the caller passes no precision.
+    jit_logical_or_emitter(mkldnn::impl::cpu::jit_generator *host,
+                           mkldnn::impl::cpu::cpu_isa_t host_isa,
+                           const MKLDNNNode& node,
+                           InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
+
+    size_t get_inputs_num() override;
+
+private:
+    /// ISA-specific code generation, instantiated per cpu_isa_t.
+    template <mkldnn::impl::cpu::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs,
+                  const std::vector<size_t> &out_vec_idxs) const;
+
+    void emit_impl(const std::vector<size_t> &in_vec_idxs,
+                   const std::vector<size_t> &out_vec_idxs,
+                   const std::vector<size_t> &pool_vec_idxs,
+                   const std::vector<size_t> &pool_gpr_idxs) override;
+
+    void register_table_entries() override;
+    size_t aux_vecs_count() const override;
+};
+
+
+class jit_logical_xor_emitter : public jit_emitter {  // Emitter for the logical XOR eltwise operation (per the class name; the member definitions are not part of this chunk).
+public:
+ jit_logical_xor_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+ InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);  // exec_prc falls back to FP32 when the caller omits it
+
+ size_t get_inputs_num() override;  // overrides jit_emitter; presumably the number of inputs the operation takes -- confirm in the .cpp
+
+private:
+ void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+ const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;  // overrides jit_emitter; going by the parameter names it receives input/output vector indices plus pools of spare vector and GPR indices
+
+ template <mkldnn::impl::cpu::cpu_isa_t isa>
+ void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;  // ISA-templated worker; presumably dispatched to from emit_impl -- TODO confirm
+
+ void register_table_entries() override;  // overrides jit_emitter; presumably registers constants used by the generated code -- confirm
+ size_t aux_vecs_count() const override;  // overrides jit_emitter; presumably the number of auxiliary vector registers required -- confirm
+};
+
+class jit_logical_not_emitter : public jit_emitter {  // Emitter for the logical NOT eltwise operation (per the class name; the member definitions are not part of this chunk).
+public:
+ jit_logical_not_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+ InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);  // exec_prc falls back to FP32 when the caller omits it
+
+ size_t get_inputs_num() override;  // overrides jit_emitter; presumably the number of inputs the operation takes -- confirm in the .cpp
+
+private:
+ void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+ const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;  // overrides jit_emitter; going by the parameter names it receives input/output vector indices plus pools of spare vector and GPR indices
+
+ template <mkldnn::impl::cpu::cpu_isa_t isa>
+ void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;  // ISA-templated worker; presumably dispatched to from emit_impl -- TODO confirm
+
+ void register_table_entries() override;  // overrides jit_emitter; presumably registers constants used by the generated code -- confirm
+ size_t aux_vecs_count() const override;  // overrides jit_emitter; presumably the number of auxiliary vector registers required -- confirm
+};
+
+class jit_power_static_emitter : public jit_emitter {  // Emitter for a "power static" eltwise operation (per the class name; the member definitions are not part of this chunk).
+public:
+ jit_power_static_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+ InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);  // exec_prc falls back to FP32 when the caller omits it
+
+ size_t get_inputs_num() override;  // overrides jit_emitter; presumably the number of inputs the operation takes -- confirm in the .cpp
+
+private:
+ void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+ const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;  // overrides jit_emitter; going by the parameter names it receives input/output vector indices plus pools of spare vector and GPR indices
+
+ template <mkldnn::impl::cpu::cpu_isa_t isa>
+ void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;  // ISA-templated worker; presumably dispatched to from emit_impl -- TODO confirm
+
+ void register_table_entries() override;  // overrides jit_emitter; presumably registers constants used by the generated code -- confirm
+ size_t aux_vecs_count() const override;  // overrides jit_emitter; presumably the number of auxiliary vector registers required -- confirm
+};
+
+class jit_prelu_emitter : public jit_emitter {  // Emitter for the PReLU eltwise operation (per the class name; the member definitions are not part of this chunk).
+public:
+ jit_prelu_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+ InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);  // exec_prc falls back to FP32 when the caller omits it
+
+ size_t get_inputs_num() override;  // overrides jit_emitter; presumably the number of inputs the operation takes -- confirm in the .cpp
+
+private:
+ void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+ const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;  // overrides jit_emitter; going by the parameter names it receives input/output vector indices plus pools of spare vector and GPR indices
+
+ template <mkldnn::impl::cpu::cpu_isa_t isa>
+ void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;  // ISA-templated worker; presumably dispatched to from emit_impl -- TODO confirm
+
+ size_t aux_vecs_count() const override;  // overrides jit_emitter; presumably the number of auxiliary vector registers required -- confirm (note: this class declares no register_table_entries override)
+};
+
+} // namespace MKLDNNPlugin
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "common/emitter.h"
+#include "jit_mkldnn_emitters.hpp"
+#include "mkldnn_eltwise_node.h"
+#include "legacy/ie_layers.h"
+
+using namespace mkldnn::impl::utils;
+using namespace mkldnn::impl::cpu;
+using namespace Xbyak;
+
+namespace MKLDNNPlugin {
+
+// Builds the eltwise injector that matches the host ISA.
+//
+// The node handed to the base class is re-read through the inherited `n` member and must be an
+// MKLDNNEltwiseNode: the reference dynamic_cast throws std::bad_cast otherwise. The node's
+// algorithm, alpha and beta are forwarded to the injector. Exactly one of the three injector
+// members is populated; for any other ISA only a debug assertion fires and all members stay null.
+jit_mkldnn_emitter::jit_mkldnn_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, InferenceEngine::Precision exec_prc)
+    : jit_emitter(host, host_isa, node, exec_prc) {
+    const auto &eltwise = dynamic_cast<const MKLDNNEltwiseNode&>(n);
+    const auto kind = static_cast<mkldnn_alg_kind_t>(eltwise.getAlgorithm());
+
+    if (host_isa_ == cpu::sse42) {
+        using injector_t = jit_uni_eltwise_injector_f32<cpu::sse42>;
+        eltwise_injector_sse42 = std::make_shared<injector_t>(host, kind, eltwise.getAlpha(), eltwise.getBeta());
+        return;
+    }
+
+    if (host_isa_ == cpu::avx2) {
+        using injector_t = jit_uni_eltwise_injector_f32<cpu::avx2>;
+        eltwise_injector_avx2 = std::make_shared<injector_t>(host, kind, eltwise.getAlpha(), eltwise.getBeta());
+        return;
+    }
+
+    if (host_isa_ == cpu::avx512_common) {
+        using injector_t = jit_uni_eltwise_injector_f32<cpu::avx512_common>;
+        eltwise_injector_avx512_common = std::make_shared<injector_t>(host, kind, eltwise.getAlpha(), eltwise.getBeta());
+        return;
+    }
+
+    // None of the ISAs handled above matched.
+    assert(!"unsupported isa");
+}
+
+// Always reports a single input.
+size_t jit_mkldnn_emitter::get_inputs_num() {
+    return 1;
+}
+
+// Generates code for the eltwise operation on one vector register.
+//
+// Only element 0 of in_vec_idxs / out_vec_idxs is read; pool_vec_idxs and pool_gpr_idxs are not
+// used here. When the source and destination indices differ, the source register is first copied
+// into the destination, then the ISA-matching injector's compute_vector() is invoked on the
+// destination index. For an ISA other than sse42 / avx2 / avx512_common only a debug assertion fires.
+void jit_mkldnn_emitter::emit(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+                              const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
+    if (host_isa_ == cpu::sse42) {
+        const size_t dst_idx = out_vec_idxs[0];
+        const size_t src_idx = in_vec_idxs[0];
+        if (dst_idx != src_idx)
+            h->uni_vmovups(Xmm(dst_idx), Xmm(src_idx));
+        eltwise_injector_sse42->compute_vector(dst_idx);
+        return;
+    }
+
+    if (host_isa_ == cpu::avx2) {
+        const size_t dst_idx = out_vec_idxs[0];
+        const size_t src_idx = in_vec_idxs[0];
+        if (dst_idx != src_idx)
+            h->uni_vmovups(Ymm(dst_idx), Ymm(src_idx));
+        eltwise_injector_avx2->compute_vector(dst_idx);
+        return;
+    }
+
+    if (host_isa_ == cpu::avx512_common) {
+        const size_t dst_idx = out_vec_idxs[0];
+        const size_t src_idx = in_vec_idxs[0];
+        if (dst_idx != src_idx)
+            h->uni_vmovups(Zmm(dst_idx), Zmm(src_idx));
+        eltwise_injector_avx512_common->compute_vector(dst_idx);
+        return;
+    }
+
+    // None of the ISAs handled above matched.
+    assert(!"unsupported isa");
+}
+
+// Forwards to prepare_table() of the injector that matches host_isa_.
+// For an ISA other than sse42 / avx2 / avx512_common only a debug assertion fires.
+void jit_mkldnn_emitter::emit_table() {
+    if (host_isa_ == cpu::sse42) {
+        eltwise_injector_sse42->prepare_table();
+        return;
+    }
+
+    if (host_isa_ == cpu::avx2) {
+        eltwise_injector_avx2->prepare_table();
+        return;
+    }
+
+    if (host_isa_ == cpu::avx512_common) {
+        eltwise_injector_avx512_common->prepare_table();
+        return;
+    }
+
+    // None of the ISAs handled above matched.
+    assert(!"unsupported isa");
+}
+
+
+} // namespace MKLDNNPlugin
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "common/emitter.h"
+#include "jit_generator.hpp"
+#include "mkldnn_node.h"
+#include "jit_uni_eltwise.hpp"
+
+namespace MKLDNNPlugin {
+
+class jit_mkldnn_emitter : public jit_emitter {  // jit_emitter implementation that holds jit_uni_eltwise_injector_f32 instances, one member per ISA (sse42 / avx2 / avx512_common).
+public:
+ jit_mkldnn_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+ InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);  // exec_prc falls back to FP32 when the caller omits it
+
+ size_t get_inputs_num() override;  // overrides jit_emitter
+
+ void emit(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
+ const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;  // overrides jit_emitter::emit directly (this class declares no emit_impl)
+
+ void emit_table() override;  // overrides jit_emitter
+
+private:
+ std::shared_ptr<mkldnn::impl::cpu::jit_uni_eltwise_injector_f32<mkldnn::impl::cpu::sse42>> eltwise_injector_sse42;  // injector specialised for sse42
+ std::shared_ptr<mkldnn::impl::cpu::jit_uni_eltwise_injector_f32<mkldnn::impl::cpu::avx2>> eltwise_injector_avx2;  // injector specialised for avx2
+ std::shared_ptr<mkldnn::impl::cpu::jit_uni_eltwise_injector_f32<mkldnn::impl::cpu::avx512_common>> eltwise_injector_avx512_common;  // injector specialised for avx512_common
+};
+
+} // namespace MKLDNNPlugin
+++ /dev/null
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#include "mkldnn_activation_node.h"
-#include "desc_iterator.hpp"
-#include <legacy/ie_layers.h>
-#include <algorithm>
-#include <string>
-#include <mkldnn_extension_utils.h>
-
-using namespace mkldnn;
-using namespace MKLDNNPlugin;
-using namespace InferenceEngine;
-using namespace InferenceEngine::details;
-
-// TODO: (ichuraev) I don't fully sure that names of types and parameters are correct for square, abs, sqrt, linear, bounded_relu and soft_relu
-caseless_map<std::string, std::function<void(GenericLayer*, mkldnn::algorithm&, float&, float&)>> MKLDNNActivationNode::initializers = {
- {"relu", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
- alpha = activationLayer->GetParamAsFloat("negative_slope", 0.0f);
- beta = 0.0f;
- algorithm = eltwise_relu;
- }},
- {"gelu", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
- alpha = 0.0f;
- beta = 0.0f;
- algorithm = eltwise_gelu;
- }},
- {"elu", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
- alpha = activationLayer->GetParamAsFloat("alpha", 1.0f);
- beta = 0.0f;
- algorithm = eltwise_elu;
- }},
- {"tanh", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
- alpha = 0.0f;
- beta = 0.0f;
- algorithm = eltwise_tanh;
- }},
- {"logistic", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
- alpha = 0.0f;
- beta = 0.0f;
- algorithm = eltwise_logistic;
- }},
- {"square", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
- alpha = 0.0f;
- beta = 0.0f;
- algorithm = eltwise_square;
- }},
- {"abs", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
- alpha = 0.0f;
- beta = 0.0f;
- algorithm = eltwise_abs;
- }},
- {"sqrt", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
- alpha = 0.0f;
- beta = 0.0f;
- algorithm = eltwise_sqrt;
- }},
- {"linear", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
- alpha = activationLayer->GetParamAsFloat("alpha", 1.0f);
- beta = activationLayer->GetParamAsFloat("beta", 0.0f);
- algorithm = eltwise_linear;
- }},
- {"bounded_relu", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
- alpha = activationLayer->GetParamAsFloat("alpha", 0.0f);
- beta = 0.0f;
- algorithm = eltwise_bounded_relu;
- }},
- {"soft_relu", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
- alpha = 0.0f;
- beta = 0.0f;
- algorithm = eltwise_soft_relu;
- }},
- {"relu6", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
- alpha = activationLayer->GetParamAsFloat("n", 6.0f);
- beta = 0.0f;
- algorithm = eltwise_bounded_relu;
- }},
- {"clamp", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
- alpha = activationLayer->GetParamAsFloat("max", 1.0f);
- beta = activationLayer->GetParamAsFloat("min", 0.0f);
- algorithm = eltwise_clamp;
- }},
- {"exp", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
- alpha = 0.0f;
- beta = 0.0f;
- algorithm = eltwise_exp;
- }},
- {"not", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
- alpha = 0.0f;
- beta = 0.0f;
- algorithm = eltwise_not;
- }},
- {"swish", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
- alpha = activationLayer->GetParamAsFloat("alpha", 1.0f);
- beta = 0.0f;
- algorithm = eltwise_swish;
- }},
- {"hswish", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
- alpha = 0.0f;
- beta = 0.0f;
- algorithm = eltwise_hswish;
- }},
- {"mish", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
- alpha = 0.0f;
- beta = 0.0f;
- algorithm = eltwise_mish;
- }},
- {"hsigmoid", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
- alpha = 0.0f;
- beta = 0.0f;
- algorithm = eltwise_hsigmoid;
- }},
-};
-
-MKLDNNActivationNode::MKLDNNActivationNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng,
- MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(layer, eng, cache) {
- GenericLayer* activationLayer = getCnnLayer().get();
- if (activationLayer == nullptr)
- THROW_IE_EXCEPTION << "Cannot get CNNLayer.";
-
- std::string type = activationLayer->type;
- CaselessEq<std::string> comparator;
- if (comparator(type, "activation"))
- type = activationLayer->GetParamAsString("type");
- if (comparator(type, "sigmoid"))
- type = "logistic";
-
- if (initializers.find(type) != initializers.end())
- initializers[type](activationLayer, algorithm, alpha, beta);
-}
-
-void MKLDNNActivationNode::getSupportedDescriptors() {
- if (!descs.empty())
- return;
-
- if (getParentEdges().size() != 1)
- THROW_IE_EXCEPTION << "Incorrect number of input edges for layer " << getName();
- if (!getChildEdges().size())
- THROW_IE_EXCEPTION << "Incorrect number of output edges for layer " << getName();
-
- auto parentOutDims = getParentEdgeAt(0)->getDims();
-
- InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision();
-
- // FIXME: MKLDNN doesn't support not inputs with number of dimensions less than 4 for activation
- while (parentOutDims.ndims() < 4)
- parentOutDims.push_back(1);
- for (auto format : getAvailableFormatsForDims(parentOutDims)) {
- MKLDNNMemoryDesc in_candidate(parentOutDims, MKLDNNExtensionUtils::IEPrecisionToDataType(precision), format);
- createDescriptor({in_candidate}, {});
- }
-}
-
-void MKLDNNActivationNode::createPrimitive() {
- if (prim)
- return;
-
- auto prim_desc = createPrimitiveDescriptor<eltwise_forward::primitive_desc, eltwise_forward::desc>();
-
- prim.reset(new eltwise_forward(prim_desc, getParentEdgeAt(0)->getMemory().GetPrimitive(),
- getChildEdgeAt(0)->getMemory().GetPrimitive()));
-}
-
-bool MKLDNNActivationNode::created() const {
- return getType() == Activation;
-}
-
-void MKLDNNActivationNode::createDescriptor(const std::vector<InferenceEngine::TensorDesc> &inputDesc,
- const std::vector<InferenceEngine::TensorDesc> &outputDesc) {
- MKLDNNMemoryDesc inDesc(inputDesc[0]);
- MKLDNNDescriptor desc(std::shared_ptr<eltwise_forward::desc>(
- new eltwise_forward::desc(prop_kind::forward_scoring, getAlgorithm(), inDesc, getAlpha(), getBeta())));
- descs.push_back(desc);
-}
-
-void MKLDNNActivationNode::initOptimalPrimitiveDescriptor() {
- auto config = getSelectedPrimitiveDescriptor()->getConfig();
- if (isInitConfig(config))
- return;
-
- if (config.inConfs.size() != 1 || config.outConfs.size() != 1 ||
- (!isUninitTensorDesc(config.inConfs[0].desc) &&
- !isUninitTensorDesc(config.outConfs[0].desc) && config.inConfs[0].desc != config.outConfs[0].desc))
- THROW_IE_EXCEPTION << "Layer " << getName() << " has incorrect selected config!";
-
- if (!isUninitTensorDesc(config.inConfs[0].desc)) {
- config.outConfs[0].desc = config.inConfs[0].desc;
- } else if (!isUninitTensorDesc(config.outConfs[0].desc)) {
- config.inConfs[0].desc = config.outConfs[0].desc;
- } else {
- config.outConfs[0].desc = config.inConfs[0].desc = getConfiguredInputDesc(config, 0);
- }
-
- initDescriptor(config);
-}
-
-MKLDNNMemoryDesc MKLDNNActivationNode::getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) {
- InferenceEngine::TensorDesc desc = MKLDNNMemoryDesc(primitive_desc_it.src_primitive_desc(idx).desc());
-
- auto parentOutDims = getParentEdgeAt(idx)->getDims().ToSizeVector();
-
- SizeVector blocked_dims, order, dimOffsets, strides;
- size_t offset = desc.getBlockingDesc().getOffsetPadding();
-
- for (size_t i = 0; i < desc.getBlockingDesc().getStrides().size(); i++) {
- if (desc.getBlockingDesc().getOrder()[i] >= parentOutDims.size())
- continue;
-
- blocked_dims.push_back(desc.getBlockingDesc().getBlockDims()[i]);
- order.push_back(desc.getBlockingDesc().getOrder()[i]);
- dimOffsets.push_back(desc.getBlockingDesc().getOffsetPaddingToData()[i]);
- strides.push_back(desc.getBlockingDesc().getStrides()[i]);
- }
- if (desc.getLayout() == InferenceEngine::Layout::ANY)
- return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
- parentOutDims,
- desc.getLayout()));
- else
- return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
- parentOutDims,
- {blocked_dims, order, offset, dimOffsets, strides}));
-}
-
-MKLDNNMemoryDesc MKLDNNActivationNode::getDstMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) {
- InferenceEngine::TensorDesc desc = MKLDNNMemoryDesc(primitive_desc_it.dst_primitive_desc(idx).desc());
-
- auto childInDims = getChildEdgeAt(idx)->getDims().ToSizeVector();
-
- SizeVector blocked_dims, order, dimOffsets, strides;
- size_t offset = desc.getBlockingDesc().getOffsetPadding();
-
- for (size_t i = 0; i < desc.getBlockingDesc().getStrides().size(); i++) {
- if (desc.getBlockingDesc().getOrder()[i] >= childInDims.size())
- continue;
-
- blocked_dims.push_back(desc.getBlockingDesc().getBlockDims()[i]);
- order.push_back(desc.getBlockingDesc().getOrder()[i]);
- dimOffsets.push_back(desc.getBlockingDesc().getOffsetPaddingToData()[i]);
- strides.push_back(desc.getBlockingDesc().getStrides()[i]);
- }
- if (desc.getLayout() == InferenceEngine::Layout::ANY)
- return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
- childInDims,
- desc.getLayout()));
- else
- return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
- childInDims,
- {blocked_dims, order, offset, dimOffsets, strides}));
-}
-
-REG_MKLDNN_PRIM_FOR(MKLDNNActivationNode, Activation);
+++ /dev/null
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma once
-
-#include <ie_common.h>
-#include <mkldnn_node.h>
-#include "caseless.hpp"
-#include <string>
-#include <memory>
-#include <vector>
-
-namespace MKLDNNPlugin {
-
-class MKLDNNActivationNode : public MKLDNNNode {
-public:
- MKLDNNActivationNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
- ~MKLDNNActivationNode() override = default;
-
- void getSupportedDescriptors() override;
- void initOptimalPrimitiveDescriptor() override;
- void createDescriptor(const std::vector<InferenceEngine::TensorDesc>& inputDesc,
- const std::vector<InferenceEngine::TensorDesc>& outputDesc) override;
- void createPrimitive() override;
- bool created() const override;
-
- mkldnn::algorithm getAlgorithm() const { return algorithm; }
- float getAlpha() const { return alpha; }
- float getBeta() const { return beta; }
-
- MKLDNNMemoryDesc getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
- MKLDNNMemoryDesc getDstMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
-
-private:
- float alpha = 0.0f;
- float beta = 0.0f;
- static InferenceEngine::details::caseless_map<std::string,
- std::function<void(InferenceEngine::GenericLayer*, mkldnn::algorithm&, float&, float&)>> initializers;
- mkldnn::algorithm algorithm = mkldnn::algorithm::eltwise_relu;
-};
-
-} // namespace MKLDNNPlugin
-
//
#include "mkldnn_batchnorm_node.h"
-#include "mkldnn_depthwise_node.h"
#include <mkldnn_extension_utils.h>
#include "common/cpu_memcpy.h"
const std::vector<InferenceEngine::TensorDesc>& outputDesc) override;
void createPrimitive() override;
bool created() const override;
- bool fusedWithScale() const {return fusedWith.size() == 1 && fusedWith[0]->getType() == Depthwise
+ bool fusedWithScale() const {return fusedWith.size() == 1 && fusedWith[0]->getType() == Eltwise
&& fusedWith[0]->getCnnLayer()->type == "ScaleShift";}
MKLDNNMemoryDesc getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
#include "mkldnn_bin_conv_node.h"
#include "mkldnn_reorder_node.h"
#include "mkldnn_input_node.h"
-#include "mkldnn_activation_node.h"
-#include "desc_iterator.hpp"
#include "mkldnn_eltwise_node.h"
-#include "mkldnn_depthwise_node.h"
+#include "desc_iterator.hpp"
#include "mkldnn_quantize_node.h"
#include "mkldnn_conv_node.h"
#include <legacy/ie_layers.h>
paddingR[i] = (dst - calc_dst) * stride[i];
}
- withSum = isFusedWith(Eltwise);
withDWConv = isFusedWith(Convolution);
withBinarization = isFusedWith(Quantize);
for (auto &node : fusedWith) {
#endif
}
- int expectedInputEdgesNum = baseInputsNumber + isFusedWith(Eltwise);
+ withSum = false;
+ int expectedInputEdgesNum = baseInputsNumber;
for (int i = 0; i < fusedWith.size(); i++) {
auto *convolutionNode = dynamic_cast<MKLDNNConvolutionNode *>(fusedWith[i].get());
if (convolutionNode) {
expectedInputEdgesNum += convolutionNode->getBaseIntputsNumber() - 1;
}
+
+ auto *eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
+ if (eltwiseNode && eltwiseNode->isSum()) {
+ withSum = true;
+ expectedInputEdgesNum++;
+ }
}
if (getParentEdges().size() != expectedInputEdgesNum)
for (auto &node : fusedWith) {
#if defined (COMPILED_CPU_MKLDNN_ELTWISE_NODE)
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
- if (eltwiseNode) {
- if (eltwiseNode->getCnnLayer()->precision == Precision::I8) {
- auto it = eltwiseNode->getCnnLayer()->blobs.find("eltwise-sum-scale");
- if (it != eltwiseNode->getCnnLayer()->blobs.end()) {
- // currently there is the only one scale while we need scale by channel :(
- ops.append_sum(it->second->buffer().as<float*>()[0]);
- }
- } else {
- ops.append_sum(1.0);
- }
+ if (eltwiseNode && eltwiseNode->isSum()) {
+ ops.append_sum(1.0);
continue;
}
-#endif
-
-#if defined(COMPILED_CPU_MKLDNN_ACTIVATION_NODE)
- auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get());
- if (activationNode) {
- ops.append_eltwise(1.0, activationNode->getAlgorithm(), activationNode->getAlpha(),
- activationNode->getBeta());
- continue;
- }
-#endif
-
-#if defined (COMPILED_CPU_MKLDNN_DEPTHWISE_NODE)
- auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode *>(node.get());
- if (depthwiseNode) {
- auto* depthwiseLayer = reinterpret_cast<WeightableLayer*>(depthwiseNode->getCnnLayer().get());
-
- if (initWeights) {
- MKLDNNDims depthwiseDims({static_cast<ptrdiff_t>(rnd_up(biasesDims[0], 16))});
-
- PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
- PostOpsIntBlobMemory[blob_idx]->Create(depthwiseDims, memory::data_type::f32, memory::format::x);
- PostOpsIntBlobMemory[blob_idx]->FillZero();
-
- PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::x,
- depthwiseLayer->_weights->buffer(),
- depthwiseLayer->_weights->size() *
- MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
-
- if (depthwiseNode->isBroadcast()) {
- float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[0];
- for (int i = 1; i < PostOpsIntBlobMemory[blob_idx]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
- static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[i] = broadcastValue;
- }
- }
-
- if (depthwiseNode->getAlgorithm() == depthwise_scale_shift) {
- PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
- PostOpsIntBlobMemory[blob_idx + 1]->Create(depthwiseDims, memory::data_type::f32,
- memory::format::x);
- PostOpsIntBlobMemory[blob_idx + 1]->FillZero();
- PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::x,
- depthwiseLayer->_biases->buffer(),
- depthwiseLayer->_biases->size() *
- MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
-
- if (depthwiseNode->isBroadcast()) {
- float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[0];
- for (int i = 1; i < PostOpsIntBlobMemory[blob_idx + 1]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
- static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[i] = broadcastValue;
- }
- }
-
- ops.append_depthwise(depthwiseNode->getAlgorithm(),
- (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(),
- (const float *) PostOpsIntBlobMemory[blob_idx + 1]->GetData());
-
- blob_idx += 2;
- } else {
- ops.append_depthwise(depthwiseNode->getAlgorithm(),
- (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(),
- nullptr);
-
- blob_idx += 1;
- }
- } else {
- ops.append_depthwise(depthwiseNode->getAlgorithm(),
- nullptr,
- nullptr);
- }
+ if (eltwiseNode) {
+ eltwiseNode->appendPostOps(ops);
continue;
}
#endif
#include "mkldnn_conv_node.h"
#include "mkldnn_quantize_node.h"
#include "mkldnn_pooling_node.h"
+#include "mkldnn_eltwise_node.h"
#include <limits>
#include "common/cpu_memcpy.h"
MKLDNNDims dstDims = getChildEdgeAt(0)->getDims();
InferenceEngine::LayerConfig config;
config.dynBatchSupport = true;
- bool hasEltwise = false;
for (size_t i = 0; i < getParentEdges().size(); i++) {
auto parentEdge = getParentEdgeAt(i);
- if (parentEdge->getParent()->getType() == Eltwise)
- hasEltwise = true;
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = -1;
config.outConfs.resize(1);
config.outConfs[0].inPlace = -1;
config.outConfs[0].constant = false;
- if ((!isMixedPrecision && outputPrecision != Precision::U8 && outputPrecision != Precision::I8) || axis != 1 || hasEltwise) {
+ if ((!isMixedPrecision && outputPrecision != Precision::U8 && outputPrecision != Precision::I8) || axis != 1) {
auto fmt = (inputPrecision == Precision::U8 || inputPrecision == Precision::I8) ? dims.ndims() == 2 ? memory::format::nc :
dims.ndims() == 4 ? memory::format::nhwc :
memory::format::ndhwc
}
}
- if (axis != 1 || hasEltwise)
+ if (axis != 1)
return;
auto numOfDim = static_cast<size_t>(dstDims.ndims());
#include "mkldnn_conv_node.h"
#include "mkldnn_reorder_node.h"
#include "mkldnn_input_node.h"
-#include "mkldnn_activation_node.h"
#include "desc_iterator.hpp"
#include "mkldnn_eltwise_node.h"
-#include "mkldnn_depthwise_node.h"
#include "mkldnn_quantize_node.h"
#include "mkldnn_pooling_node.h"
#include "mkldnn_concat_node.h"
if (convLayer == nullptr)
THROW_IE_EXCEPTION << "Cannot convert convolution layer.";
+ withSum = false;
+ int expectedInputEdgesNum = baseInputsNumber;
+ for (int i = 0; i < fusedWith.size(); i++) {
+ auto *convolutionNode = dynamic_cast<MKLDNNConvolutionNode *>(fusedWith[i].get());
+ if (convolutionNode) {
+ expectedInputEdgesNum += convolutionNode->getBaseIntputsNumber() - 1;
+ }
+
+ auto *eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
+ if (eltwiseNode && eltwiseNode->isSum()) {
+ withSum = true;
+ expectedInputEdgesNum++;
+ }
+ }
+
auto inputDataType = precisionToDataType(getCnnLayer()->insData[0].lock()->getPrecision());
if (!inputZeroPoints.empty())
inputDataType = memory::u8;
// We need to make sure that convolution output and second input of fused Eltwise operation
// have equal precision sizes since they use the same physical memory. In case precisions are different we upscale to FP32.
- if (outputDataType != memory::f32 && outputDataType != memory::bf16 && isFusedWith(Eltwise)) {
+ if (outputDataType != memory::f32 && outputDataType != memory::bf16 && withSum) {
for (int i = 0; i < fusedWith.size(); i++) {
auto *eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
- if (eltwiseNode) {
+ if (eltwiseNode && eltwiseNode->isSum()) {
eltwisePrecision = fusedEltwisePrecision(eltwiseNode, i);
if (MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType).size() != eltwisePrecision.size()) {
eltwisePrecision = Precision::FP32;
}
}
- int expectedInputEdgesNum = baseInputsNumber + isFusedWith(Eltwise);
- for (int i = 0; i < fusedWith.size(); i++) {
- auto *convolutionNode = dynamic_cast<MKLDNNConvolutionNode *>(fusedWith[i].get());
- if (convolutionNode) {
- expectedInputEdgesNum += convolutionNode->getBaseIntputsNumber() - 1;
- }
- }
-
if (getParentEdges().size() != expectedInputEdgesNum)
THROW_IE_EXCEPTION << "Incorrect number of input edges for layer " << getName();
if (getChildEdges().empty())
MKLDNNDims weightsDims = MKLDNNDims(weightDims);
- withSum = isFusedWith(Eltwise);
withDWConv = isFusedWith(Convolution);
for (int i = 0; i < fusedWith.size(); i++) {
eltwisePrecision = Precision::FP32;
for (int i = 0; i < fusedWith.size(); i++) {
auto *eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
- if (eltwiseNode) {
+ if (eltwiseNode && eltwiseNode->isSum()) {
eltwisePrecision = fusedEltwisePrecision(eltwiseNode, i);
// TODO(amalyshe): there might be situation when convolution can be executed in BF16,
// output is required in FP32 but eltwise inplace tensor would be in BF16
if (node->getType() == Split || node->getType() == Concatenation)
continue;
-#if defined (COMPILED_CPU_MKLDNN_ELTWISE_NODE)
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
- if (eltwiseNode) {
- if (eltwiseNode->getCnnLayer()->precision == Precision::I8) {
- auto it = eltwiseNode->getCnnLayer()->blobs.find("eltwise-sum-scale");
- if (it != eltwiseNode->getCnnLayer()->blobs.end()) {
- // currently there is the only one scale while we need scale by channel :(
- ops.append_sum(it->second->buffer().as<float*>()[0], mkldnn::memory::convert_to_c(precisionToDataType(eltwisePrecision)));
- }
- } else {
+ if (eltwiseNode && eltwiseNode->isSum()) {
ops.append_sum(1.0, mkldnn::memory::convert_to_c(precisionToDataType(eltwisePrecision)));
- }
-
continue;
}
-#endif
-
-#if defined(COMPILED_CPU_MKLDNN_ACTIVATION_NODE)
- auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get());
- if (activationNode) {
- ops.append_eltwise(1.0, activationNode->getAlgorithm(), activationNode->getAlpha(),
- activationNode->getBeta());
- continue;
- }
-#endif
-
-#if defined (COMPILED_CPU_MKLDNN_DEPTHWISE_NODE)
- auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode *>(node.get());
- if (depthwiseNode) {
- auto* depthwiseLayer = reinterpret_cast<WeightableLayer*>(depthwiseNode->getCnnLayer().get());
-
- if (initWeights) {
- MKLDNNDims depthwiseDims({static_cast<ptrdiff_t>(rnd_up(biasesDims[0], 16))});
-
- PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
- PostOpsIntBlobMemory[blob_idx]->Create(depthwiseDims, memory::data_type::f32, memory::format::x);
- PostOpsIntBlobMemory[blob_idx]->FillZero();
- PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::x,
- depthwiseLayer->_weights->buffer(),
- depthwiseLayer->_weights->size() *
- MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
-
- if (depthwiseNode->isBroadcast()) {
- float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[0];
- for (int i = 1; i < PostOpsIntBlobMemory[blob_idx]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
- static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[i] = broadcastValue;
- }
- }
-
- if (depthwiseNode->getAlgorithm() == depthwise_scale_shift) {
- PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
- PostOpsIntBlobMemory[blob_idx + 1]->Create(depthwiseDims, memory::data_type::f32,
- memory::format::x);
- PostOpsIntBlobMemory[blob_idx + 1]->FillZero();
- PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::x,
- depthwiseLayer->_biases->buffer(),
- depthwiseLayer->_biases->size() *
- MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
-
- if (depthwiseNode->isBroadcast()) {
- float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[0];
- for (int i = 1; i < PostOpsIntBlobMemory[blob_idx + 1]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
- static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[i] = broadcastValue;
- }
- }
-
- ops.append_depthwise(depthwiseNode->getAlgorithm(),
- (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(),
- (const float *) PostOpsIntBlobMemory[blob_idx + 1]->GetData());
-
- blob_idx += 2;
- } else {
- ops.append_depthwise(depthwiseNode->getAlgorithm(),
- (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(),
- nullptr);
-
- blob_idx += 1;
- }
- } else {
- ops.append_depthwise(depthwiseNode->getAlgorithm(),
- nullptr,
- nullptr);
- }
+ if (eltwiseNode) {
+ eltwiseNode->appendPostOps(ops);
continue;
}
-#endif
auto* quantizeNode = dynamic_cast<MKLDNNQuantizeNode *>(node.get());
if (quantizeNode) {
#include "mkldnn_def_conv_node.h"
#include "mkldnn_reorder_node.h"
#include "mkldnn_input_node.h"
-#include "mkldnn_activation_node.h"
#include "desc_iterator.hpp"
#include "mkldnn_eltwise_node.h"
-#include "mkldnn_depthwise_node.h"
#include <legacy/ie_layers.h>
#include <string>
#include <vector>
+++ /dev/null
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#include "mkldnn_depthwise_node.h"
-#include "desc_iterator.hpp"
-#include <legacy/ie_layers.h>
-#include <string>
-#include <vector>
-#include <mkldnn_types.h>
-#include <mkldnn_extension_utils.h>
-#include "caseless.hpp"
-
-using namespace mkldnn;
-using namespace MKLDNNPlugin;
-using namespace InferenceEngine;
-using namespace InferenceEngine::details;
-
-MKLDNNDepthwiseNode::MKLDNNDepthwiseNode(InferenceEngine::CNNLayerPtr layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache)
- : MKLDNNNode(layer, eng, cache) {
- internalBlobDesc.emplace_back([&](primitive_desc_iterator &primitive_desc_it, size_t idx) -> MKLDNNMemoryDesc {
- return MKLDNNMemoryDesc(primitive_desc_it.weights_primitive_desc(0).desc());
- });
- internalBlobDesc.emplace_back([&](primitive_desc_iterator &primitive_desc_it, size_t idx) -> MKLDNNMemoryDesc {
- if (!isWithBiases())
- return MKLDNNMemoryDesc();
- return MKLDNNMemoryDesc(primitive_desc_it.weights_primitive_desc(1).desc());
- });
-}
-
-void MKLDNNDepthwiseNode::getSupportedDescriptors() {
- InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision();
- if (precision != InferenceEngine::Precision::FP32)
- precision = InferenceEngine::Precision::FP32;
- auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
-
- auto parentOutDims = getParentEdgeAt(0)->getDims();
-
- if (getParentEdges().size() != 1)
- THROW_IE_EXCEPTION << "Cannot create layer " << getName() << ": Incorrect number of inputs!";
- if (parentOutDims != getChildEdgeAt(0)->getDims())
- THROW_IE_EXCEPTION << "Cannot create layer " << getName() << ": Incorrect dimensions!";
-
- auto size = static_cast<size_t>(parentOutDims.ndims() == 1 ? parentOutDims[0] : parentOutDims[1]);
- SizeVector weightDims = { size };
- MKLDNNDims blocked_weightDims(weightDims);
-
- auto * wLayer = dynamic_cast<InferenceEngine::WeightableLayer*>(getCnnLayer().get());
- if (wLayer == nullptr)
- THROW_IE_EXCEPTION << "Cannot get weightable layer for node " << getName() << ".";
-
- InferenceEngine::Blob::Ptr blb = wLayer->_weights;
- if (blb)
- realWeightSize = blb->size();
- internalBlobs.push_back(createInternalBlob(weightDims, true));
- if (isWithBiases()) {
- InferenceEngine::Blob::Ptr blb = wLayer->_biases;
- if (blb)
- realBiasSize = blb->size();
- internalBlobs.push_back(createInternalBlob(weightDims, false));
- }
-
- for (auto format : getAvailableFormatsForDims(parentOutDims)) {
- MKLDNNMemoryDesc in_candidate{parentOutDims, inputDataType, format};
- createDescriptor({in_candidate}, {});
- }
-}
-
-void MKLDNNDepthwiseNode::initSupportedPrimitiveDescriptors() {
- if (!supportedPrimitiveDescriptors.empty())
- return;
-
- auto parentOutDims = getParentEdgeAt(0)->getDims();
- if (parentOutDims.ndims() <= 5) {
- MKLDNNNode::initSupportedPrimitiveDescriptors();
- } else {
- createSpecificDescriptor5D();
- if (specificDesc5DPtr == nullptr)
- THROW_IE_EXCEPTION << "Cannot create specific MKLDNNDescriptor for depthwise node " << getName();
- const auto& desc = *specificDesc5DPtr;
- auto itpd = desc.createPrimitiveDescriptorIterator(getEngine());
- while (itpd.is_not_end()) {
- InferenceEngine::LayerConfig config;
- config.dynBatchSupport = true;
- for (size_t i = 0; i < descInputNumbers(desc); i++) {
- InferenceEngine::DataConfig dataConfig;
- dataConfig.inPlace = -1;
- dataConfig.constant = false;
- dataConfig.desc = MKLDNNMemoryDesc(InferenceEngine::TensorDesc(Precision::FP32, parentOutDims.ToSizeVector(), Layout::ANY));
- config.inConfs.push_back(dataConfig);
- }
-
- std::vector<mkldnn::memory::format> outFormats;
- for (size_t i = 0; i < descOutputNumbers(desc); i++) {
- InferenceEngine::DataConfig dataConfig;
- dataConfig.inPlace = canBeInPlace() ? 0 : -1;
- dataConfig.constant = false;
- dataConfig.desc = MKLDNNMemoryDesc(InferenceEngine::TensorDesc(Precision::FP32, parentOutDims.ToSizeVector(), Layout::ANY));
- config.outConfs.push_back(dataConfig);
-
- auto primDesc = itpd.fetch();
- auto dstPrimDesc = mkldnn_primitive_desc_query_pd(primDesc.get(), mkldnn::convert_to_c(dst_pd), 0);
- if (dstPrimDesc) {
- outFormats.emplace_back(static_cast<memory::format>(itpd.dst_primitive_desc().desc().data.format));
- } else {
- // This path is needed to correctly handle Deconvolution node
- auto diffSrcPrimDesc = mkldnn_primitive_desc_query_pd(primDesc.get(), mkldnn::convert_to_c(diff_src_pd), 0);
- if (diffSrcPrimDesc) {
- outFormats.emplace_back(static_cast<memory::format>(itpd.diff_src_primitive_desc().desc().data.format));
- }
- }
- }
- impl_desc_type impl_type = parse_impl_name(itpd.get_impl_info_str());
-
- supportedPrimitiveDescriptors.emplace_back(config, impl_type, outFormats);
- itpd++;
- }
- }
-}
-
-void MKLDNNDepthwiseNode::createPrimitive() {
- if (prim)
- return;
-
- auto& dstMemPtr = getChildEdgeAt(0)->getMemoryPtr();
- auto& srcMemPtr = getParentEdgeAt(0)->getMemoryPtr();
- if (!dstMemPtr || !dstMemPtr->GetPrimitivePtr())
- THROW_IE_EXCEPTION << "Destination memory didn't allocate.";
- if (!srcMemPtr || !srcMemPtr->GetPrimitivePtr())
- THROW_IE_EXCEPTION << "Input memory didn't allocate.";
- if (getSelectedPrimitiveDescriptor() == nullptr)
- THROW_IE_EXCEPTION << "Preferable primitive descriptor is not set.";
-
- auto createRightPrimitiveDescriptor = [&]() -> depthwise_forward::primitive_desc {
- auto parentOutDims = getParentEdgeAt(0)->getDims();
- if (parentOutDims.ndims() <= 5) {
- return createPrimitiveDescriptor<depthwise_forward::primitive_desc, depthwise_forward::desc>();
- } else {
- const PrimitiveDescInfo *selected_pd = getSelectedPrimitiveDescriptor();
- auto& desc = *specificDesc5DPtr;
- auto itpd = desc.createPrimitiveDescriptorIterator(getEngine(), mkldnn::primitive_attr());
-
- while (itpd.is_not_end()) {
- impl_desc_type impl_type = parse_impl_name(itpd.get_impl_info_str());
- if (impl_type == getSelectedPrimitiveDescriptor()->getImplementationType()) {
- specificPrepareMemory5D(itpd);
- std::shared_ptr<depthwise_forward::desc> selected_desc_ptr = desc;
- depthwise_forward::primitive_desc prim_desc = depthwise_forward::primitive_desc(*selected_desc_ptr, getEngine());
- return prim_desc;
- }
- itpd++;
- }
- THROW_IE_EXCEPTION << "Cannot create specific primitive descriptor for depthwise node " << getName() << ".";
- }
- };
-
- auto prim_desc = createRightPrimitiveDescriptor();
-
- if (isBroadcast()) {
- float broadcastValue = static_cast<float*>(internalBlobMemory[0]->GetData())[0];
- size_t blbSize = internalBlobMemory[0]->GetPrimitiveDescriptor().desc().data.dims[0];
- for (int i = 1; i < blbSize && realWeightSize != blbSize; i++) {
- static_cast<float*>(internalBlobMemory[0]->GetData())[i] = broadcastValue;
- }
-
- if (isWithBiases()) {
- blbSize = internalBlobMemory[1]->GetPrimitiveDescriptor().desc().data.dims[0];
- broadcastValue = static_cast<float*>(internalBlobMemory[1]->GetData())[0];
- for (int i = 1; i < blbSize && realBiasSize != blbSize; i++) {
- static_cast<float*>(internalBlobMemory[1]->GetData())[i] = broadcastValue;
- }
- }
- } else {
- size_t blbSize = internalBlobMemory[0]->GetPrimitiveDescriptor().desc().data.dims[0];
- if (realWeightSize != blbSize)
- THROW_IE_EXCEPTION << "Cannot create layer " << getName() << ": Incorrect weights!";
- if (isWithBiases()) {
- blbSize = internalBlobMemory[1]->GetPrimitiveDescriptor().desc().data.dims[0];
- if (realBiasSize != blbSize)
- THROW_IE_EXCEPTION << "Cannot create layer " << getName() << ": Incorrect biases!";
- }
- }
-
- if (isWithBiases()) {
- prim.reset(new depthwise_forward(prim_desc, getParentEdgeAt(0)->getMemory().GetPrimitive(),
- internalBlobMemory[0]->GetPrimitive(),
- internalBlobMemory[1]->GetPrimitive(),
- getChildEdgeAt(0)->getMemory().GetPrimitive()));
- } else {
- prim.reset(new depthwise_forward(prim_desc, getParentEdgeAt(0)->getMemory().GetPrimitive(),
- internalBlobMemory[0]->GetPrimitive(),
- getChildEdgeAt(0)->getMemory().GetPrimitive()));
- }
-}
-
-bool MKLDNNDepthwiseNode::created() const {
- return getType() == Depthwise;
-}
-
-void MKLDNNDepthwiseNode::init() {
- GenericLayer* depthwiseLayer = getCnnLayer().get();
- if (depthwiseLayer == nullptr)
- THROW_IE_EXCEPTION << "Cannot get CNNLayer.";
-
- CaselessEq<std::string> comparator;
- if (comparator(depthwiseLayer->type, "ScaleShift")) {
- auto *scshLayer = dynamic_cast<ScaleShiftLayer*>(getCnnLayer().get());
- if (scshLayer == nullptr)
- THROW_IE_EXCEPTION << "Cannot get scale shift layer " << getName();
- if (scshLayer->_weights == nullptr)
- THROW_IE_EXCEPTION << "ScaleShift without weights is not supported";
-
- algorithm = depthwise_scale_shift;
- withBiases = scshLayer->_biases != nullptr;
- broadcast = static_cast<bool>(scshLayer->_broadcast);
- } else if (comparator(depthwiseLayer->type, "PReLU")) {
- auto *preluLayer = dynamic_cast<PReLULayer*>(getCnnLayer().get());
- if (preluLayer == nullptr)
- THROW_IE_EXCEPTION << "Cannot get PReLU layer " << getName();
- if (preluLayer->_weights == nullptr)
- THROW_IE_EXCEPTION << "PReLU without weights is not supported";
-
- algorithm = depthwise_prelu;
- withBiases = false;
- broadcast = preluLayer->_channel_shared;
- } else {
- THROW_IE_EXCEPTION << "Unsupported depthwise operation";
- }
-}
-
-void MKLDNNDepthwiseNode::createDescriptor(const std::vector<InferenceEngine::TensorDesc> &inputDesc,
- const std::vector<InferenceEngine::TensorDesc> &outputDesc) {
- MKLDNNMemoryDesc in_candidate(inputDesc[0]);
- MKLDNNMemoryDesc out_candidate(inputDesc[0]);
- MKLDNNDims weightDims({in_candidate.getDims().ndims() == 1 ? in_candidate.getDims()[0] : in_candidate.getDims()[1]});
-
- MKLDNNMemoryDesc wgh_candidate{weightDims, in_candidate.getDataType(), memory::x};
-
- if (isWithBiases()) {
- MKLDNNMemoryDesc bias_candidate{weightDims, in_candidate.getDataType(), memory::x};
- MKLDNNDescriptor desc(std::shared_ptr<depthwise_forward::desc>(
- new depthwise_forward::desc(prop_kind::forward_scoring, getAlgorithm(), in_candidate, out_candidate, wgh_candidate, bias_candidate)));
- descs.push_back(desc);
- } else {
- MKLDNNDescriptor desc(std::shared_ptr<depthwise_forward::desc>(
- new depthwise_forward::desc(prop_kind::forward_scoring, getAlgorithm(), in_candidate, out_candidate, wgh_candidate)));
- descs.push_back(desc);
- }
-}
-
-void MKLDNNDepthwiseNode::initOptimalPrimitiveDescriptor() {
- auto selected_pd = getSelectedPrimitiveDescriptor();
- if (selected_pd == nullptr)
- THROW_IE_EXCEPTION << "Preferable primitive descriptor is not set.";
- auto config = selected_pd->getConfig();
- if (isInitConfig(config))
- return;
-
- if (config.inConfs.size() != 1 || config.outConfs.size() != 1 || (!isUninitTensorDesc(config.inConfs[0].desc) &&
- !isUninitTensorDesc(config.outConfs[0].desc) && config.inConfs[0].desc != config.outConfs[0].desc))
- THROW_IE_EXCEPTION << "Layer " << getName() << " has incorrect selected config!";
-
- if (getParentEdgeAt(0)->getDims().ndims() > 5)
- return;
-
- if (!isUninitTensorDesc(config.inConfs[0].desc)) {
- config.outConfs[0].desc = config.inConfs[0].desc;
- } else if (!isUninitTensorDesc(config.outConfs[0].desc)) {
- config.inConfs[0].desc = config.outConfs[0].desc;
- } else {
- config.outConfs[0].desc = config.inConfs[0].desc = getConfiguredInputDesc(config, 0);
- }
-
- initDescriptor(config);
-}
-
-void MKLDNNDepthwiseNode::createSpecificDescriptor5D() {
- auto parentOutDims = getParentEdgeAt(0)->getDims();
- MKLDNNDims newDims;
- for (int i = 0; i < 4; i++)
- newDims.push_back(parentOutDims[i]);
- int lastDim = 1;
- for (int i = 4; i < parentOutDims.ndims(); i++) {
- lastDim *= parentOutDims[i];
- }
- newDims.push_back(lastDim);
-
- MKLDNNMemoryDesc in_candidate{newDims, MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::Precision::FP32), mkldnn::memory::ncdhw};
- MKLDNNMemoryDesc out_candidate(in_candidate);
- MKLDNNDims weightDims({in_candidate.getDims()[1]});
-
- MKLDNNMemoryDesc wgh_candidate{weightDims, in_candidate.getDataType(), memory::x};
-
- if (isWithBiases()) {
- MKLDNNMemoryDesc bias_candidate{weightDims, in_candidate.getDataType(), memory::x};
- MKLDNNDescriptor desc(std::shared_ptr<depthwise_forward::desc>(
- new depthwise_forward::desc(prop_kind::forward_scoring, getAlgorithm(), in_candidate, out_candidate, wgh_candidate, bias_candidate)));
- specificDesc5DPtr = std::make_shared<MKLDNNDescriptor>(desc);
- } else {
- MKLDNNDescriptor desc(std::shared_ptr<depthwise_forward::desc>(
- new depthwise_forward::desc(prop_kind::forward_scoring, getAlgorithm(), in_candidate, out_candidate, wgh_candidate)));
- specificDesc5DPtr = std::make_shared<MKLDNNDescriptor>(desc);
- }
-}
-
-void MKLDNNDepthwiseNode::specificPrepareMemory5D(mkldnn::primitive_desc_iterator& itpd) {
- std::vector<MKLDNNMemoryDesc> intDescs;
- for (auto &it : internalBlobDesc)
- intDescs.push_back(it(itpd, 0));
-
- internalBlobMemory.clear();
- for (size_t i = 0; i < internalBlobs.size(); i++) {
- const auto &internalBlob = internalBlobs[i];
-
- auto create = [&] () {
- auto newDesc = MKLDNNMemoryDesc(internalBlob->getTensorDesc());
- auto newFormat = newDesc.getFormat();
- if (newFormat == mkldnn::memory::ncdhw) {
- newFormat = mkldnn::memory::goihw;
- }
- if (newFormat == mkldnn::memory::nchw) {
- newFormat = mkldnn::memory::oihw;
- }
-
- MKLDNNMemory memory{ getEngine() };
- memory.Create(MKLDNNMemoryDesc(newDesc.getDims(), newDesc.getDataType(), newFormat), internalBlob->buffer());
-
- MKLDNNMemoryPtr _ptr = MKLDNNMemoryPtr(new MKLDNNMemory(getEngine()));
- _ptr->Create(intDescs[i]);
- _ptr->SetData(memory);
-
- return _ptr;
- };
-
- MKLDNNMemoryPtr ptr;
- if (weightCache != nullptr) {
- const uint64_t data_hash = weightCache->GetHashFunc().hash(
- internalBlob->buffer(), internalBlob->byteSize());
-
- const std::string string_hash = getName() + "_" + std::to_string(i)
- + "_" + std::to_string(internalBlob->byteSize())
- + "_" + std::to_string(data_hash);
-
- ptr = weightCache->findOrCreate(string_hash, create);
- } else {
- ptr = create();
- }
-
- internalBlobMemory.push_back(ptr);
- }
-}
-
-REG_MKLDNN_PRIM_FOR(MKLDNNDepthwiseNode, Depthwise);
+++ /dev/null
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma once
-
-#include <ie_common.h>
-#include <mkldnn_node.h>
-#include <string>
-#include <memory>
-#include <vector>
-
-namespace MKLDNNPlugin {
-
-class MKLDNNDepthwiseNode : public MKLDNNNode {
-public:
- MKLDNNDepthwiseNode(InferenceEngine::CNNLayerPtr layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
- ~MKLDNNDepthwiseNode() override = default;
-
- void createDescriptor(const std::vector<InferenceEngine::TensorDesc>& inputDesc,
- const std::vector<InferenceEngine::TensorDesc>& outputDesc) override;
- void initOptimalPrimitiveDescriptor() override;
- void getSupportedDescriptors() override;
- void initSupportedPrimitiveDescriptors() override;
- void createPrimitive() override;
- bool created() const override;
-
- mkldnn::algorithm getAlgorithm() const { return algorithm; }
- bool isWithBiases() const { return withBiases; }
- bool isBroadcast() const { return broadcast; }
-
-private:
- void init() override;
-
- mkldnn::algorithm algorithm = mkldnn::algorithm::depthwise_scale_shift;
- size_t realWeightSize = 0;
- size_t realBiasSize = 0;
- bool withBiases = false;
- bool broadcast = false;
-
- std::shared_ptr<MKLDNNDescriptor> specificDesc5DPtr;
- void createSpecificDescriptor5D();
- void specificPrepareMemory5D(mkldnn::primitive_desc_iterator& itpd);
-};
-
-} // namespace MKLDNNPlugin
#include <mkldnn_extension_utils.h>
#include "ie_parallel.hpp"
#include "mkldnn_quantize_node.h"
-#include "mkldnn_activation_node.h"
#include <map>
#include "jit_uni_eltwise.hpp"
#include "jit_uni_quantization.hpp"
+#include "common/emitter.h"
+#include "jit_eltwise_emitters.hpp"
+#include "jit_mkldnn_emitters.hpp"
+#include "ref_eltwise.hpp"
+#include "mkldnn_pooling_node.h"
-using namespace mkldnn;
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
-using namespace mkldnn::impl;
-using namespace mkldnn::impl::cpu;
using namespace mkldnn::impl::utils;
+
+using namespace mkldnn::impl::cpu;
using namespace Xbyak;
-#define GET_OFF(field) offsetof(jit_eltwise_fq_call_args, field)
+#define GET_OFF(field) offsetof(jit_eltwise_call_args, field)
template <cpu_isa_t isa>
-struct jit_uni_eltwise_fq_generic : public jit_uni_eltwise_fq_kernel, public jit_generator {
- DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_eltwise_fq_generic)
-
- explicit jit_uni_eltwise_fq_generic(jit_eltwise_fq_params jep, const mkldnn_primitive_attr &attr) : jit_uni_eltwise_fq_kernel(jep, attr), jit_generator() {
- const auto &p = attr_.post_ops_;
- for (int i = 0; i < p.len_; i++) {
- auto &post_op = p.entry_[i];
- if (post_op.is_eltwise()) {
- eltwise_injectors.push_back(std::make_shared<jit_uni_eltwise_injector_f32<isa>>(
- this, post_op.eltwise.alg, post_op.eltwise.alpha, post_op.eltwise.beta));
- } else if (post_op.is_quantization()) {
+struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_generator {
+ DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_eltwise_generic)
+
+ explicit jit_uni_eltwise_generic(jit_eltwise_params jep, MKLDNNEltwiseNode& eltwiseNode) : jit_uni_eltwise_kernel(jep, eltwiseNode), jit_generator() {
+ Precision exec_prc = Precision::UNSPECIFIED;
+
+ std::set<Precision> supported_precision_intersection = get_supported_precisions(eltwiseNode);
+ for (int i = 0; i < eltwiseNode.getFusedWith().size(); i++) {
+ if (eltwiseNode.getFusedWith()[i].get()->getType() == Eltwise) {
+ std::set<Precision> prcs = get_supported_precisions(*eltwiseNode.getFusedWith()[i].get());
+
+ std::set_intersection(supported_precision_intersection.begin(), supported_precision_intersection.end(),
+ prcs.begin(), prcs.end(), std::inserter(supported_precision_intersection, supported_precision_intersection.begin()));
+ }
+ }
+
+ for (auto prc : exec_precisions_priority) {
+ if (std::find(supported_precision_intersection.begin(), supported_precision_intersection.end(), prc) != supported_precision_intersection.end()) {
+ exec_prc = prc;
+ break;
+ }
+ }
+
+ for (int i = 0; i < jep_.inputs_number; i++) {
+ if (jep_.src_prc[i] != exec_prc) {
+ exec_prc = Precision::FP32;
+ break;
+ }
+ }
+
+ if (exec_prc == Precision::UNSPECIFIED) {
+ THROW_IE_EXCEPTION << "Eltwise jitter failed to specify execution precision for Eltwise node with name `" << eltwiseNode.getName() << "`";
+ }
+
+ eltwise_emitter = create_eltwise_emitter(eltwiseNode, exec_prc);
+
+ mkldnn::post_ops post_ops;
+ for (int i = 0; i < eltwiseNode.getFusedWith().size(); i++) {
+ if (eltwiseNode.getFusedWith()[i].get()->getType() == Eltwise) {
+ post_op_emitters.push_back(create_eltwise_emitter(*eltwiseNode.getFusedWith()[i].get(), exec_prc));
+ } else if (eltwiseNode.getFusedWith()[i].get()->getType() == Quantize) {
+ auto quantizeNode = dynamic_cast<MKLDNNQuantizeNode*>(eltwiseNode.getFusedWith()[i].get());
+ quantizeNode->appendPostOps(post_ops);
+
quantization_injectors.push_back(std::make_shared<jit_uni_quantization_injector_f32<isa>>(
- this, post_op, vmm_d_weights, vmm_d_bias, reg_d_weights, reg_d_bias));
+ this, post_ops.get()->entry_[post_ops.get()->len_ - 1], vmm_d_weights, vmm_d_bias, reg_d_weights, reg_d_bias));
}
}
this->preamble();
- mov(reg_src0, ptr[reg_params + GET_OFF(src0)]);
- mov(reg_src1, ptr[reg_params + GET_OFF(src1)]);
+ for (int i = 0; i < jep.inputs_number; i++)
+ mov(get_src_reg(i), ptr[reg_params + GET_OFF(src_ptr[0]) + i * sizeof(size_t)]);
mov(reg_dst, ptr[reg_params + GET_OFF(dst)]);
mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]);
- xor_(reg_oc_off, reg_oc_off);
+ mov(reg_oc_off, ptr[reg_params + GET_OFF(oc_off)]);
+ Xbyak::Label unroll_loop_label;
+ Xbyak::Label unroll_loop_end_label;
Xbyak::Label main_loop_label;
Xbyak::Label main_loop_end_label;
Xbyak::Label tail_loop_label;
if (isa == avx512_common)
vpxord(vmm_zero, vmm_zero, vmm_zero);
- if (jep.src0_step == 0)
- uni_vbroadcastss(vmm_src0, ptr[reg_src0]);
- if (jep.src1_step == 0)
- uni_vbroadcastss(vmm_src1, ptr[reg_src1]);
+ for (int i = 0; i < jep.inputs_number; i++) {
+ if (jep.src_size[i] == 1)
+ load_vector(get_vmm_reg(i), ptr[get_src_reg(i)], jep.src_prc[i], exec_prc, true);
+ }
- L(main_loop_label);
- {
- cmp(reg_work_amount, simd_w);
- jl(main_loop_end_label, T_NEAR);
-
- if (jep.src0_step != 0)
- load_vector(vmm_src0, ptr[reg_src0], jep.src0_dt);
- if (jep.src1_step != 0)
- load_vector(vmm_src1, ptr[reg_src1], jep.src1_dt);
-
- switch (jep.eltwise_op) {
- case EltwiseLayer::eOperation::Sum:
- if (isa == cpu::sse42) {
- uni_vmovups(vmm_dst, vmm_src0);
- uni_vaddps(vmm_dst, vmm_dst, vmm_src1);
- } else {
- uni_vaddps(vmm_dst, vmm_src0, vmm_src1);
- }
- break;
- case EltwiseLayer::eOperation::Prod:
- if (isa == cpu::sse42) {
- uni_vmovups(vmm_dst, vmm_src0);
- uni_vmulps(vmm_dst, vmm_dst, vmm_src1);
- } else {
- uni_vmulps(vmm_dst, vmm_src0, vmm_src1);
- }
- break;
- default: THROW_IE_EXCEPTION << "Unsupported operation type for Eltwise node";
+ size_t min_src_size = jep.dst_size;
+ for (int i = 0; i < jep.inputs_number; i++) {
+ if (jep.src_size[i] != 1)
+ min_src_size = std::min(min_src_size, jep.src_size[i]);
+ }
+ if (jep_.oc_size > 1)
+ min_src_size = std::min(min_src_size, jep_.oc_size);
+
+ if (min_src_size != jep.dst_size) {
+ bool is_valid_configuration = true;
+ if (jep.dst_size % min_src_size != 0)
+ is_valid_configuration = false;
+
+ for (int i = 0; i < jep.inputs_number; i++) {
+ if (jep.src_size[i] != 1 && jep.src_size[i] != min_src_size && jep.src_size[i] != jep.dst_size)
+ is_valid_configuration = false;
}
- int eltwise_inj_idx = 0;
- int quantization_inj_idx = 0;
- for (int i = 0; i < p.len_; i++) {
- auto &post_op = p.entry_[i];
- if (post_op.is_eltwise()) {
- eltwise_injectors[eltwise_inj_idx]->compute_vector_range(vmm_dst.getIdx(), vmm_dst.getIdx() + 1);
- eltwise_inj_idx++;
- } else if (post_op.is_quantization()) {
- bool do_dequantization = post_op.quantization.alg == alg_kind::quantization_quantize_dequantize;
- bool do_rounding = do_dequantization || jep_.dst_dt == data_type::f32 || i != p.len_ - 1;
- int s_idx = vmm_dst.getIdx();
+ if (jep_.oc_size > 1 && jep_.oc_size != min_src_size && jep_.oc_size != jep.dst_size)
+ is_valid_configuration = false;
+
+ if (!is_valid_configuration)
+ THROW_IE_EXCEPTION << "Eltwise jitter has invalid configuration for Eltwise node with name `" << eltwiseNode.getName() << "`";
+
+ L(unroll_loop_label);
+ {
+ size_t loop_step = min_src_size;
+ size_t vec_step = cpu_isa_traits<isa>::vlen / exec_prc.size();
- quantization_injectors[quantization_inj_idx]->init_crop_ptrs(reg_oc_off);
- quantization_injectors[quantization_inj_idx]->compute_crop(s_idx, s_idx + 1, 0);
+ cmp(reg_work_amount, loop_step);
+ jl(unroll_loop_end_label, T_NEAR);
- quantization_injectors[quantization_inj_idx]->init_input_scale_shift_ptrs(reg_oc_off);
- quantization_injectors[quantization_inj_idx]->compute_input_scale_shift(s_idx, s_idx + 1, 0, do_rounding);
+ for (int j = 0; j < min_src_size / vec_step; j++) {
+ for (int i = 0; i < jep.inputs_number; i++) {
+ if (jep.src_size[i] != 1)
+ load_vector(get_vmm_reg(i), ptr[get_src_reg(i) + j * vec_step * jep.src_prc[i].size()], jep.src_prc[i], exec_prc, false);
+ }
+
+ compute_eltwise_op();
- quantization_injectors[quantization_inj_idx]->init_output_scale_shift_ptrs(reg_oc_off);
- quantization_injectors[quantization_inj_idx]->compute_output_scale_shift(s_idx, s_idx + 1, 0);
+ apply_post_ops(false, jep_.oc_size > 1 ? j * vec_step * sizeof(float) : 0);
- quantization_inj_idx++;
+ store_vector(ptr[reg_dst + j * vec_step * jep.dst_prc.size()], vmm_dst, exec_prc, jep.dst_prc);
}
- }
- store_vector(ptr[reg_dst], vmm_dst, jep.dst_dt);
+ int tail_start = min_src_size - min_src_size % vec_step;
+ for (int j = tail_start; j < min_src_size; j++) {
+ for (int i = 0; i < jep.inputs_number; i++) {
+ if (jep.src_size[i] != 1)
+ load_scalar(get_xmm_reg(i), ptr[get_src_reg(i) + j * jep.src_prc[i].size()], jep.src_prc[i], exec_prc);
+ }
- if (jep.src0_step != 0)
- add(reg_src0, jep.src0_step * jep.src0_data_size * simd_w);
- if (jep.src1_step != 0)
- add(reg_src1, jep.src1_step * jep.src1_data_size * simd_w);
- add(reg_dst, jep.dst_step * jep.dst_data_size * simd_w);
- sub(reg_work_amount, simd_w);
- add(reg_oc_off, simd_w * sizeof(float));
+ compute_eltwise_op();
- jmp(main_loop_label, T_NEAR);
- }
+ apply_post_ops(true, jep_.oc_size > 1 ? j * sizeof(float) : 0);
- L(main_loop_end_label);
+ store_scalar(ptr[reg_dst + j * jep.dst_prc.size()], xmm_dst, exec_prc, jep.dst_prc);
+ }
- L(tail_loop_label);
- {
- cmp(reg_work_amount, 1);
- jl(tail_loop_end_label, T_NEAR);
+ for (int i = 0; i < jep.inputs_number; i++)
+ if (jep.src_size[i] == jep.dst_size)
+ add(get_src_reg(i), jep.src_prc[i].size() * loop_step);
- if (jep.src0_step != 0)
- load_scalar(xmm_src0, ptr[reg_src0], jep.src0_dt);
- if (jep.src1_step != 0)
- load_scalar(xmm_src1, ptr[reg_src1], jep.src1_dt);
+ add(reg_dst, jep.dst_prc.size() * loop_step);
+ sub(reg_work_amount, loop_step);
+ if (jep_.oc_size > 1 && jep_.oc_size != min_src_size)
+ add(reg_oc_off, loop_step * sizeof(float));
- switch (jep.eltwise_op) {
- case EltwiseLayer::eOperation::Sum: uni_vaddps(vmm_dst, vmm_src0, vmm_src1); break;
- case EltwiseLayer::eOperation::Prod: uni_vmulps(vmm_dst, vmm_src0, vmm_src1); break;
- default: THROW_IE_EXCEPTION << "Unsupported operation type for Eltwise node";
+ jmp(unroll_loop_label, T_NEAR);
}
- int eltwise_inj_idx = 0;
- int quantization_inj_idx = 0;
- for (int i = 0; i < p.len_; i++) {
- auto &post_op = p.entry_[i];
- if (post_op.is_eltwise()) {
- eltwise_injectors[eltwise_inj_idx]->compute_vector_range(vmm_dst.getIdx(), vmm_dst.getIdx() + 1);
- eltwise_inj_idx++;
- } else if (post_op.is_quantization()) {
- bool do_dequantization = post_op.quantization.alg == alg_kind::quantization_quantize_dequantize;
- bool do_rounding = do_dequantization || jep_.dst_dt == data_type::f32 || i != p.len_ - 1;
- int s_idx = vmm_dst.getIdx();
-
- quantization_injectors[quantization_inj_idx]->init_crop_ptrs(reg_oc_off);
- quantization_injectors[quantization_inj_idx]->compute_crop(s_idx, s_idx + 1, 0, true);
+ L(unroll_loop_end_label);
+ }
- quantization_injectors[quantization_inj_idx]->init_input_scale_shift_ptrs(reg_oc_off);
- quantization_injectors[quantization_inj_idx]->compute_input_scale_shift(s_idx, s_idx + 1, 0, do_rounding, true);
+ if (min_src_size == jep.dst_size) {
+ L(main_loop_label);
+ {
+ size_t loop_step = cpu_isa_traits<isa>::vlen / exec_prc.size();
- quantization_injectors[quantization_inj_idx]->init_output_scale_shift_ptrs(reg_oc_off);
- quantization_injectors[quantization_inj_idx]->compute_output_scale_shift(s_idx, s_idx + 1, 0, true);
+ cmp(reg_work_amount, loop_step);
+ jl(main_loop_end_label, T_NEAR);
- quantization_inj_idx++;
+ for (int i = 0; i < jep.inputs_number; i++) {
+ if (jep.src_size[i] != 1)
+ load_vector(get_vmm_reg(i), ptr[get_src_reg(i)], jep.src_prc[i], exec_prc, false);
}
+
+ compute_eltwise_op();
+
+ apply_post_ops(false);
+
+ store_vector(ptr[reg_dst], vmm_dst, exec_prc, jep.dst_prc);
+
+ for (int i = 0; i < jep.inputs_number; i++)
+ if (jep.src_size[i] != 1)
+ add(get_src_reg(i), jep.src_prc[i].size() * loop_step);
+
+ add(reg_dst, jep.dst_prc.size() * loop_step);
+ sub(reg_work_amount, loop_step);
+ if (jep_.oc_size > 1)
+ add(reg_oc_off, loop_step * sizeof(float));
+
+ jmp(main_loop_label, T_NEAR);
+ }
+
+ L(main_loop_end_label);
+ }
+
+ L(tail_loop_label);
+ {
+ size_t loop_step = 1;
+
+ cmp(reg_work_amount, loop_step);
+ jl(tail_loop_end_label, T_NEAR);
+
+ for (int i = 0; i < jep.inputs_number; i++) {
+ if (jep.src_size[i] != 1)
+ load_scalar(get_xmm_reg(i), ptr[get_src_reg(i)], jep.src_prc[i], exec_prc);
}
- store_scalar(ptr[reg_dst], xmm_dst, jep.dst_dt);
+ compute_eltwise_op();
+
+ apply_post_ops(true);
+
+ store_scalar(ptr[reg_dst], xmm_dst, exec_prc, jep.dst_prc);
- if (jep.src0_step != 0)
- add(reg_src0, jep.src0_step * jep.src0_data_size);
- if (jep.src1_step != 0)
- add(reg_src1, jep.src1_step * jep.src1_data_size);
- add(reg_dst, jep.dst_step * jep.dst_data_size);
- sub(reg_work_amount, 1);
- add(reg_oc_off, 1 * sizeof(float));
+ for (int i = 0; i < jep.inputs_number; i++)
+ if (jep.src_size[i] != 1)
+ add(get_src_reg(i), jep.src_prc[i].size() * loop_step);
+
+ add(reg_dst, jep.dst_prc.size() * loop_step);
+ sub(reg_work_amount, loop_step);
+ if (jep_.oc_size > 1)
+ add(reg_oc_off, loop_step * sizeof(float));
jmp(tail_loop_label, T_NEAR);
}
this->postamble();
- for (auto& inj : eltwise_injectors)
- inj->prepare_table();
+ eltwise_emitter->emit_table();
+ for (int i = 0; i < post_op_emitters.size(); i++) {
+ post_op_emitters[i]->emit_table();
+ }
ker_ = (decltype(ker_)) this->getCode();
}
private:
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
- const int simd_w = cpu_isa_traits<isa>::vlen / sizeof(float);
+ // Maps input number `idx` to the general-purpose register that holds its source pointer.
+ // Registers are handed out consecutively, beginning with r8.
+ Reg64 get_src_reg(int idx) {
+ const int first_src_gpr = r8.getIdx();
+ return Reg64(first_src_gpr + idx);
+ }
+
+ // Maps input number `idx` to the vector register that holds its data: Vmm(1), Vmm(2), ...
+ Vmm get_vmm_reg(int idx) {
+ const int input_vmm_idx = idx + 1;
+ return Vmm(input_vmm_idx);
+ }
+
+ // Maps auxiliary slot `idx` to a scratch vector register: Vmm(10), Vmm(11), ...
+ Vmm get_aux_vmm(int idx) {
+ const int aux_vmm_idx = idx + 10;
+ return Vmm(aux_vmm_idx);
+ }
+
+ // Returns the 128-bit (Xmm) register with the same index as the vector register of input `idx`.
+ Xmm get_xmm_reg(int idx) {
+ const Vmm input_vmm = get_vmm_reg(idx);
+ return Xmm(input_vmm.getIdx());
+ }
+
+ Reg64 reg_dst = rbx; // output pointer: the loops store through ptr[reg_dst] and advance it by dst_prc.size() * loop_step
+ Reg64 reg_work_amount = rdx; // remaining element count: compared against loop_step and decremented by it in each loop
- Reg64 reg_src0 = r8;
- Reg64 reg_src1 = r9;
- Reg64 reg_dst = r10;
- Reg64 reg_work_amount = r11;
- Reg64 reg_oc_off = r13;
+ Reg64 reg_oc_off = abi_not_param1; // offset handed to the quantization injectors; advanced only when jep_.oc_size > 1
Reg64 reg_params = abi_param1;
- Reg8 reg_tmp_8 = r12b;
- Reg32 reg_tmp_32 = r12d;
- Reg64 reg_tmp_64 = r12;
+ Reg8 reg_tmp_8 = Reg8(r15.getIdx()); // 8-bit view of the r15 scratch register used by the scalar load/store helpers
+ Reg32 reg_tmp_32 = Reg32(r15.getIdx()); // 32-bit view of the same r15 scratch register
+ Reg64 reg_tmp_64 = Reg64(r15.getIdx()); // 64-bit view of the same r15 scratch register
- Reg64 reg_d_weights = r14;
- Reg64 reg_d_bias = r15;
+ Reg64 reg_d_weights = rbp; // NOTE(review): presumably the weights pointer for the quantization injectors - confirm at the injector construction site
+ Reg64 reg_d_bias = rsi; // NOTE(review): presumably the bias pointer for the quantization injectors - confirm at the injector construction site
- Vmm vmm_src0 = Vmm(0);
- Vmm vmm_src1 = Vmm(1);
- Vmm vmm_dst = Vmm(2);
- Xmm xmm_src0 = Xmm(0);
- Xmm xmm_src1 = Xmm(1);
- Xmm xmm_dst = Xmm(2);
+ Vmm vmm_dst = Vmm(9); // result register of the main operation and of every post-op; this is what the loops store
+ Xmm xmm_dst = Xmm(9); // 128-bit view of vmm_dst used by the scalar tail loop
- Vmm vmm_d_weights = Vmm(3);
- Vmm vmm_d_bias = Vmm(4);
+ Vmm vmm_d_weights = Vmm(12); // NOTE(review): same index as get_aux_vmm(2) - verify no emitter requests more than two aux vectors
+ Vmm vmm_d_bias = Vmm(13); // NOTE(review): same index as get_aux_vmm(3) - see the note on vmm_d_weights
+ Vmm vmm_zero = Vmm(15); // read as a zero operand by store_vector; NOTE(review): it is zeroed outside the visible code - confirm
- Vmm vmm_zero = Vmm(5);
+ std::shared_ptr<jit_emitter> eltwise_emitter = nullptr; // emitter of the node's own operation (used by compute_eltwise_op)
+ std::vector<std::shared_ptr<jit_emitter>> post_op_emitters = {}; // one emitter per fused Eltwise node, consumed in order by apply_post_ops
- std::vector<std::shared_ptr<jit_uni_eltwise_injector_f32<isa>>> eltwise_injectors;
- std::vector<std::shared_ptr<jit_uni_quantization_injector_f32<isa>>> quantization_injectors;
+ std::vector<std::shared_ptr<jit_uni_quantization_injector_f32<isa>>> quantization_injectors = {}; // one injector per fused non-Eltwise node, consumed in order by apply_post_ops
- inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
- switch (src_dt) {
- case memory::f32:
- case memory::s32:
- uni_vmovups(vmm_src, op);
- break;
- case memory::s8:
- uni_vpmovsxbd(vmm_src, op);
- break;
- case memory::u8:
- uni_vpmovzxbd(vmm_src, op);
- break;
- default:
- assert(!"unknown dst_dt");
+ std::vector<Precision> exec_precisions_priority = { // candidate execution precisions; NOTE(review): the order presumably encodes selection priority - confirm where the list is consumed
+ Precision::U8,
+ Precision::I8,
+ Precision::U16,
+ Precision::I16,
+ Precision::BF16,
+ Precision::I32,
+ Precision::FP32
+ };
+
+ // Returns the precisions supported by the emitter class that implements the operation of `node`.
+ // `node` has to be an MKLDNNEltwiseNode: the reference dynamic_cast throws otherwise.
+ // An operation type without an emitter raises an IE exception.
+ std::set<Precision> get_supported_precisions(MKLDNNNode& node) {
+ const auto& eltwise = dynamic_cast<const MKLDNNEltwiseNode&>(node);
+ const auto op_type = eltwise.getOpType();
+ switch (op_type) {
+ // Operations that share the generic jit_mkldnn_emitter.
+ case Relu:
+ case Gelu:
+ case Elu:
+ case Tanh:
+ case Logistic:
+ case Square:
+ case Abs:
+ case Sqrt:
+ case Linear:
+ case BoundedRelu:
+ case SoftRelu:
+ case Relu6:
+ case Exp:
+ case Clamp:
+ case Swish:
+ case Hswish:
+ case Mish:
+ case Hsigmoid:
+ return jit_mkldnn_emitter::get_supported_precisions();
+ // Operations with a dedicated emitter class.
+ case Add:
+ return jit_add_emitter::get_supported_precisions();
+ case MulAdd:
+ return jit_mul_add_emitter::get_supported_precisions();
+ case Subtract:
+ return jit_subtract_emitter::get_supported_precisions();
+ case Multiply:
+ return jit_multiply_emitter::get_supported_precisions();
+ case Divide:
+ return jit_divide_emitter::get_supported_precisions();
+ case FloorMod:
+ return jit_floor_mod_emitter::get_supported_precisions();
+ case Mod:
+ return jit_mod_emitter::get_supported_precisions();
+ case Maximum:
+ return jit_maximum_emitter::get_supported_precisions();
+ case Minimum:
+ return jit_minimum_emitter::get_supported_precisions();
+ case SquaredDifference:
+ return jit_squared_difference_emitter::get_supported_precisions();
+ case PowerDynamic:
+ return jit_power_dynamic_emitter::get_supported_precisions();
+ case Equal:
+ return jit_equal_emitter::get_supported_precisions();
+ case NotEqual:
+ return jit_not_equal_emitter::get_supported_precisions();
+ case Greater:
+ return jit_greater_emitter::get_supported_precisions();
+ case GreaterEqual:
+ return jit_greater_equal_emitter::get_supported_precisions();
+ case Less:
+ return jit_less_emitter::get_supported_precisions();
+ case LessEqual:
+ return jit_less_equal_emitter::get_supported_precisions();
+ case LogicalAnd:
+ return jit_logical_and_emitter::get_supported_precisions();
+ case LogicalOr:
+ return jit_logical_or_emitter::get_supported_precisions();
+ case LogicalXor:
+ return jit_logical_xor_emitter::get_supported_precisions();
+ case LogicalNot:
+ return jit_logical_not_emitter::get_supported_precisions();
+ case PowerStatic:
+ return jit_power_static_emitter::get_supported_precisions();
+ case Prelu:
+ return jit_prelu_emitter::get_supported_precisions();
+ default:
+ THROW_IE_EXCEPTION << "Unsupported operation type for Eltwise emitter";
+ }
+ }
+
+ // Builds the emitter object that implements the operation of `node`, bound to this code
+ // generator, the kernel ISA and the execution precision `exec_prec`.
+ // `node` has to be an MKLDNNEltwiseNode: the reference dynamic_cast throws otherwise.
+ // An operation type without an emitter raises an IE exception.
+ std::shared_ptr<jit_emitter> create_eltwise_emitter(MKLDNNNode& node, Precision exec_prec) {
+ const auto& eltwise = dynamic_cast<const MKLDNNEltwiseNode&>(node);
+ const auto op_type = eltwise.getOpType();
+ switch (op_type) {
+ // Operations that share the generic jit_mkldnn_emitter.
+ case Relu:
+ case Gelu:
+ case Elu:
+ case Tanh:
+ case Logistic:
+ case Square:
+ case Abs:
+ case Sqrt:
+ case Linear:
+ case BoundedRelu:
+ case SoftRelu:
+ case Relu6:
+ case Exp:
+ case Clamp:
+ case Swish:
+ case Hswish:
+ case Mish:
+ case Hsigmoid:
+ return std::make_shared<jit_mkldnn_emitter>(this, isa, eltwise, exec_prec);
+ // Operations with a dedicated emitter class.
+ case Add:
+ return std::make_shared<jit_add_emitter>(this, isa, eltwise, exec_prec);
+ case MulAdd:
+ return std::make_shared<jit_mul_add_emitter>(this, isa, eltwise, exec_prec);
+ case Subtract:
+ return std::make_shared<jit_subtract_emitter>(this, isa, eltwise, exec_prec);
+ case Multiply:
+ return std::make_shared<jit_multiply_emitter>(this, isa, eltwise, exec_prec);
+ case Divide:
+ return std::make_shared<jit_divide_emitter>(this, isa, eltwise, exec_prec);
+ case FloorMod:
+ return std::make_shared<jit_floor_mod_emitter>(this, isa, eltwise, exec_prec);
+ case Mod:
+ return std::make_shared<jit_mod_emitter>(this, isa, eltwise, exec_prec);
+ case Maximum:
+ return std::make_shared<jit_maximum_emitter>(this, isa, eltwise, exec_prec);
+ case Minimum:
+ return std::make_shared<jit_minimum_emitter>(this, isa, eltwise, exec_prec);
+ case SquaredDifference:
+ return std::make_shared<jit_squared_difference_emitter>(this, isa, eltwise, exec_prec);
+ case PowerDynamic:
+ return std::make_shared<jit_power_dynamic_emitter>(this, isa, eltwise, exec_prec);
+ case Equal:
+ return std::make_shared<jit_equal_emitter>(this, isa, eltwise, exec_prec);
+ case NotEqual:
+ return std::make_shared<jit_not_equal_emitter>(this, isa, eltwise, exec_prec);
+ case Greater:
+ return std::make_shared<jit_greater_emitter>(this, isa, eltwise, exec_prec);
+ case GreaterEqual:
+ return std::make_shared<jit_greater_equal_emitter>(this, isa, eltwise, exec_prec);
+ case Less:
+ return std::make_shared<jit_less_emitter>(this, isa, eltwise, exec_prec);
+ case LessEqual:
+ return std::make_shared<jit_less_equal_emitter>(this, isa, eltwise, exec_prec);
+ case LogicalAnd:
+ return std::make_shared<jit_logical_and_emitter>(this, isa, eltwise, exec_prec);
+ case LogicalOr:
+ return std::make_shared<jit_logical_or_emitter>(this, isa, eltwise, exec_prec);
+ case LogicalXor:
+ return std::make_shared<jit_logical_xor_emitter>(this, isa, eltwise, exec_prec);
+ case LogicalNot:
+ return std::make_shared<jit_logical_not_emitter>(this, isa, eltwise, exec_prec);
+ case PowerStatic:
+ return std::make_shared<jit_power_static_emitter>(this, isa, eltwise, exec_prec);
+ case Prelu:
+ return std::make_shared<jit_prelu_emitter>(this, isa, eltwise, exec_prec);
+ default:
+ THROW_IE_EXCEPTION << "Unsupported operation type for Eltwise emitter";
+ }
+ }
+
+ // Emits the node's own operation. Inputs are read from the per-input vector registers,
+ // the emitter receives as many auxiliary registers as it asks for, and the result is
+ // written to vmm_dst.
+ inline void compute_eltwise_op() {
+ const auto inputs_count = eltwise_emitter->get_inputs_num();
+ const auto aux_count = eltwise_emitter->aux_vecs_count();
+
+ std::vector<size_t> in_idxs;
+ for (int k = 0; k < inputs_count; k++) {
+ const Vmm input_reg = get_vmm_reg(k);
+ in_idxs.push_back(input_reg.getIdx());
+ }
+
+ std::vector<size_t> aux_idxs;
+ for (int k = 0; k < aux_count; k++) {
+ const Vmm aux_reg = get_aux_vmm(k);
+ aux_idxs.push_back(aux_reg.getIdx());
+ }
+
+ std::vector<size_t> out_idxs(1, vmm_dst.getIdx());
+
+ eltwise_emitter->emit(in_idxs, out_idxs, aux_idxs);
+ }
+
+ // Emits the code of every node fused into this eltwise node, in fusing order.
+ // The running value lives in vmm_dst: each post-op reads it and writes its result back.
+ // * Fused Eltwise nodes use the matching entry of post_op_emitters. Their first input is
+ // vmm_dst; any further inputs are taken from the input registers that follow the inputs
+ // of the main operation.
+ // * Every other fused node is treated as a Quantize node and handled by the matching
+ // entry of quantization_injectors (crop, input scale/shift, output scale/shift).
+ // `is_scalar` selects the scalar (tail) code path of the injectors, `offset` is forwarded to them.
+ // Throws if a fused node is neither an Eltwise nor a Quantize node.
+ inline void apply_post_ops(bool is_scalar, int offset = 0) {
+ const auto& fused_nodes = eltwiseNode.getFusedWith();
+ int input_idx = eltwise_emitter->get_inputs_num();
+ int eltwise_post_op_idx = 0;
+ int quantization_post_op_idx = 0;
+ for (size_t i = 0; i < fused_nodes.size(); i++) {
+ if (fused_nodes[i].get()->getType() == Eltwise) {
+ std::vector<size_t> in_idxs;
+ std::vector<size_t> aux_idxs;
+ in_idxs.push_back(vmm_dst.getIdx());
+ for (int j = 1; j < post_op_emitters[eltwise_post_op_idx]->get_inputs_num(); j++)
+ in_idxs.push_back(get_vmm_reg(input_idx++).getIdx());
+ for (int j = 0; j < post_op_emitters[eltwise_post_op_idx]->aux_vecs_count(); j++)
+ aux_idxs.push_back(get_aux_vmm(j).getIdx());
+
+ std::vector<size_t> out_idxs;
+ out_idxs.push_back(vmm_dst.getIdx());
+
+ post_op_emitters[eltwise_post_op_idx]->emit(in_idxs, out_idxs, aux_idxs);
+
+ eltwise_post_op_idx++;
+ } else {
+ auto quantizeNode = dynamic_cast<MKLDNNQuantizeNode*>(fused_nodes[i].get());
+ // The cast result used to be dereferenced unconditionally; fail loudly instead of
+ // crashing if some other node type ever gets fused.
+ if (quantizeNode == nullptr)
+ THROW_IE_EXCEPTION << "Eltwise kernel supports only Eltwise and Quantize fused nodes";
+
+ bool do_dequantization = quantizeNode->getAlgorithm() == mkldnn::quantization_quantize_dequantize;
+ // Rounding can be skipped only for the last post-op when the stored type is not FP32.
+ bool do_rounding = do_dequantization || jep_.dst_prc == Precision::FP32 || i != fused_nodes.size() - 1;
+ int s_idx = vmm_dst.getIdx();
+
+ quantization_injectors[quantization_post_op_idx]->init_crop_ptrs(reg_oc_off);
+ quantization_injectors[quantization_post_op_idx]->compute_crop(s_idx, s_idx + 1, offset, is_scalar, jep_.oc_size == 1);
+
+ quantization_injectors[quantization_post_op_idx]->init_input_scale_shift_ptrs(reg_oc_off);
+ quantization_injectors[quantization_post_op_idx]->compute_input_scale_shift(s_idx, s_idx + 1, offset, do_rounding,
+ is_scalar, jep_.oc_size == 1);
+
+ quantization_injectors[quantization_post_op_idx]->init_output_scale_shift_ptrs(reg_oc_off);
+ quantization_injectors[quantization_post_op_idx]->compute_output_scale_shift(s_idx, s_idx + 1, offset, is_scalar, jep_.oc_size == 1);
+
+ quantization_post_op_idx++;
+ }
}
+ }
+
+ // Emits code that loads one SIMD vector of `src_prc` elements from `op` into `vmm_src`
+ // and converts it to the execution precision `dst_prc` (FP32 or I32).
+ // With `broadcast` set, a single element is loaded through load_scalar and replicated
+ // to every lane instead.
+ inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, Precision src_prc, Precision dst_prc, bool broadcast) {
+ Xmm xmm_src = Xmm(vmm_src.getIdx());
+
+ if (broadcast) {
+ load_scalar(xmm_src, op, src_prc, dst_prc);
+ uni_vbroadcastss(vmm_src, xmm_src);
+ } else {
+ switch (src_prc) {
+ case Precision::FP32:
+ case Precision::I32:
+ uni_vmovups(vmm_src, op);
+ break;
+ case Precision::BF16:
+ // Use the ISA-agnostic wrapper like the U16 case below: the raw vpmovzxwd is the
+ // VEX-encoded form and must not be emitted into the sse42 kernel.
+ uni_vpmovzxwd(vmm_src, op);
+ // A BF16 value is the upper half of an FP32 value, so shifting it into the high
+ // 16 bits yields the FP32 bit pattern.
+ uni_vpslld(vmm_src, vmm_src, 16);
+ break;
+ case Precision::U16:
+ uni_vpmovzxwd(vmm_src, op);
+ break;
+ case Precision::I16:
+ uni_vpmovsxwd(vmm_src, op);
+ break;
+ case Precision::I8:
+ uni_vpmovsxbd(vmm_src, op);
+ break;
+ case Precision::U8:
+ uni_vpmovzxbd(vmm_src, op);
+ break;
+ default:
+ assert(!"unknown src_prc");
+ }
- if (src_dt != data_type::f32) {
- uni_vcvtdq2ps(vmm_src, vmm_src);
+ switch (dst_prc) {
+ case Precision::FP32:
+ // FP32 and BF16 sources already hold float bit patterns; everything else was
+ // widened to int32 above and needs an int -> float conversion.
+ if (src_prc != Precision::FP32 && src_prc != Precision::BF16)
+ uni_vcvtdq2ps(vmm_src, vmm_src);
+ break;
+ case Precision::I32:
+ break;
+ default:
+ assert(!"unknown dst_prc");
+ }
}
}
- inline void load_scalar(Xmm xmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
- switch (src_dt) {
- case memory::f32:
- case memory::s32:
+ // Emits code that loads a single `src_prc` element from `op` into lane 0 of `xmm_src`
+ // and converts it to the execution precision `dst_prc` (FP32 or I32).
+ // Only lane 0 is meaningful afterwards. The 8-bit paths go through the reg_tmp_* scratch register.
+ inline void load_scalar(Xmm xmm_src, const Xbyak::Address &op, Precision src_prc, Precision dst_prc) {
+ switch (src_prc) {
+ case Precision::FP32:
+ case Precision::I32:
movss(xmm_src, op);
break;
- case memory::s8:
+ case Precision::BF16:
+ uni_vpinsrw(xmm_src, xmm_src, op, 0);
+ // Move the BF16 bits into the upper half of the dword to obtain the FP32 bit pattern.
+ uni_vpslld(xmm_src, xmm_src, 16);
+ break;
+ case Precision::I16:
+ uni_vpinsrw(xmm_src, xmm_src, op, 0);
+ // Extend the word that was just inserted. Extending from memory would read 8 bytes
+ // and could run past the end of the buffer for the last element.
+ uni_vpmovsxwd(xmm_src, xmm_src);
+ break;
+ case Precision::U16:
+ uni_vpinsrw(xmm_src, xmm_src, op, 0);
+ // Same as I16: widen from the register, not from memory.
+ uni_vpmovzxwd(xmm_src, xmm_src);
+ break;
+ case Precision::I8:
movsx(reg_tmp_32, op);
movq(xmm_src, reg_tmp_64);
break;
- case memory::u8:
+ case Precision::U8:
movzx(reg_tmp_32, op);
movq(xmm_src, reg_tmp_64);
break;
default:
- assert(!"unknown dst_dt");
+ assert(!"unknown src_prc");
}
- if (src_dt != data_type::f32) {
- uni_vcvtdq2ps(xmm_src, xmm_src);
+ switch (dst_prc) {
+ case Precision::FP32:
+ // FP32 and BF16 sources already hold float bit patterns; integer sources need conversion.
+ if (src_prc != Precision::FP32 && src_prc != Precision::BF16)
+ uni_vcvtdq2ps(xmm_src, xmm_src);
+ break;
+ case Precision::I32:
+ break;
+ default:
+ assert(!"unknown dst_prc");
}
}
- inline void store_vector(const Xbyak::Address &op, Vmm vmm_dst, memory::data_type dst_dt) {
+ // Emits code that converts one SIMD vector from the execution precision `src_prc`
+ // (FP32 or I32) to `dst_prc` and stores it to `op`. `vmm_dst` is clobbered.
+ // Narrowing stores saturate: signed destination types use signed saturation, unsigned
+ // destination types clamp negative values to zero (matching store_scalar).
+ // NOTE(review): vmm_zero is read as a zero operand here - it has to be zeroed by the
+ // surrounding kernel code, which is outside this function.
+ inline void store_vector(const Xbyak::Address &op, Vmm vmm_dst, Precision src_prc, Precision dst_prc) {
Xmm xmm_dst = Xmm(vmm_dst.getIdx());
Ymm ymm_dst = Ymm(vmm_dst.getIdx());
- if (dst_dt != data_type::f32) {
- uni_vcvtps2dq(vmm_dst, vmm_dst);
+ switch (src_prc) {
+ case Precision::FP32:
+ // Integer destinations need float -> int32 conversion before packing.
+ if (dst_prc != Precision::FP32 && dst_prc != Precision::BF16)
+ uni_vcvtps2dq(vmm_dst, vmm_dst);
+ break;
+ case Precision::I32:
+ break;
+ default:
+ assert(!"unknown src_prc");
}
- switch (dst_dt) {
- case memory::f32:
- case memory::s32:
+ switch (dst_prc) {
+ case Precision::FP32:
+ case Precision::I32:
uni_vmovups(op, vmm_dst);
break;
- case memory::s8:
+ case Precision::BF16:
+ vcvtneps2bf16(ymm_dst, vmm_dst);
+ uni_vmovups(op, ymm_dst);
+ break;
+ case Precision::I16:
+ if (isa == avx512_common) {
+ // Signed saturating down-convert int32 -> int16.
+ vpmovsdw(op, vmm_dst);
+ } else {
+ uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst);
+ if (isa != cpu::sse42) {
+ // The pack works per 128-bit lane: gather the two useful quadwords
+ // into the low half before storing 8 words.
+ vpermq(ymm_dst, ymm_dst, 0x08);
+ vmovdqu(op, xmm_dst);
+ } else {
+ // 4 words = 64 bits.
+ movq(op, xmm_dst);
+ }
+ }
+ break;
+ case Precision::U16:
+ if (isa == avx512_common) {
+ // vpmovusdw treats its source as unsigned, so negative values must be
+ // clamped to zero first. Use an integer max: a float max would pass
+ // NaN-patterned integers such as -1 through unchanged.
+ vpmaxsd(vmm_dst, vmm_zero, vmm_dst);
+ vpmovusdw(op, vmm_dst);
+ } else {
+ uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst);
+ if (isa != cpu::sse42) {
+ vpermq(ymm_dst, ymm_dst, 0x08);
+ vmovdqu(op, xmm_dst);
+ } else {
+ movq(op, xmm_dst);
+ }
+ }
+ break;
+ case Precision::I8:
if (isa == avx512_common) {
vmaxps(vmm_dst, vmm_zero, vmm_dst);
vpmovsdb(op, vmm_dst);
movd(op, xmm_dst);
}
break;
- case memory::u8:
+ case Precision::U8:
if (isa == avx512_common) {
vpmovusdb(op, vmm_dst);
} else {
}
break;
default:
- assert(!"unknown dst_dt");
+ assert(!"unknown dst_prc");
}
}
- inline void store_scalar(const Xbyak::Address &op, Xmm xmm_dst, memory::data_type dst_dt) {
- if (dst_dt != data_type::f32) {
- uni_vcvtps2dq(xmm_dst, xmm_dst);
+ // Emits code that converts lane 0 of `xmm_dst` from the execution precision `src_prc`
+ // (FP32 or I32) to `dst_prc` and stores that single element to `op`.
+ // `xmm_dst` and the reg_tmp_* scratch register are clobbered.
+ inline void store_scalar(const Xbyak::Address &op, Xmm xmm_dst, Precision src_prc, Precision dst_prc) {
+ switch (src_prc) {
+ case Precision::FP32:
+ // Integer destinations need float -> int32 conversion before packing.
+ if (dst_prc != Precision::FP32 && dst_prc != Precision::BF16)
+ uni_vcvtps2dq(xmm_dst, xmm_dst);
+ break;
+ case Precision::I32:
+ break;
+ default:
+ assert(!"unknown src_prc");
}
- switch (dst_dt) {
- case memory::f32:
- case memory::s32:
+ switch (dst_prc) {
+ case Precision::FP32:
+ case Precision::I32:
movss(op, xmm_dst);
break;
- case memory::s8:
+ case Precision::BF16:
+ // BF16 is the upper half of the FP32 bit pattern.
+ uni_vpsrld(xmm_dst, xmm_dst, 16);
+ uni_vpextrw(op, xmm_dst, 0x0);
+ break;
+ case Precision::I16:
+ uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst);
+ movq(reg_tmp_64, xmm_dst);
+ // Store the full 16-bit element: the 8-bit view would write only its low byte.
+ mov(op, Reg16(reg_tmp_64.getIdx()));
+ break;
+ case Precision::U16:
+ uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst);
+ movq(reg_tmp_64, xmm_dst);
+ // Store the full 16-bit element: the 8-bit view would write only its low byte.
+ mov(op, Reg16(reg_tmp_64.getIdx()));
+ break;
+ case Precision::I8:
uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst);
uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
movq(reg_tmp_64, xmm_dst);
mov(op, reg_tmp_8);
break;
- case memory::u8:
+ case Precision::U8:
uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst);
uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
movq(reg_tmp_64, xmm_dst);
mov(op, reg_tmp_8);
break;
default:
- assert(!"unknown dst_dt");
+ assert(!"unknown dst_prc");
}
}
};
MKLDNNEltwiseNode::MKLDNNEltwiseNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache) :
- MKLDNNNode(layer, eng, cache), eltiwse_fq_kernel(nullptr) {
- op = EltwiseLayer::Sum;
+ MKLDNNNode(layer, eng, cache) { // only forwards the arguments to the base node; the operation type is not decided here
}
-bool MKLDNNEltwiseNode::isSum() {
- auto * eltwiseLayer = dynamic_cast<EltwiseLayer*>(getCnnLayer().get());
- if (eltwiseLayer == nullptr)
- THROW_IE_EXCEPTION << "Cannot get eltwise layer " << getName();
- return eltwiseLayer->_operation == EltwiseLayer::Sum;
-}
+InferenceEngine::details::caseless_map<std::string, std::function<void(GenericLayer*, EltwiseOpType&, mkldnn::algorithm&, float&, float&)>>
+MKLDNNEltwiseNode::initializers = { // case-insensitive table: activation name -> callback that fills opType, algorithm, alpha and beta from the layer parameters
+ {"relu", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+ alpha = activationLayer->GetParamAsFloat("negative_slope", 0.0f); // alpha carries the "negative_slope" parameter (0 when absent)
+ beta = 0.0f;
+ opType = Relu;
+ algorithm = mkldnn::eltwise_relu;
+ }},
+ {"gelu", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+ alpha = 0.0f;
+ beta = 0.0f;
+ opType = Gelu;
+ algorithm = mkldnn::eltwise_gelu;
+ }},
+ {"elu", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+ alpha = activationLayer->GetParamAsFloat("alpha", 1.0f);
+ beta = 0.0f;
+ opType = Elu;
+ algorithm = mkldnn::eltwise_elu;
+ }},
+ {"tanh", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+ alpha = 0.0f;
+ beta = 0.0f;
+ opType = Tanh;
+ algorithm = mkldnn::eltwise_tanh;
+ }},
+ {"sigmoid", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) { // same settings as "logistic"
+ alpha = 0.0f;
+ beta = 0.0f;
+ opType = Logistic;
+ algorithm = mkldnn::eltwise_logistic;
+ }},
+ {"logistic", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+ alpha = 0.0f;
+ beta = 0.0f;
+ opType = Logistic;
+ algorithm = mkldnn::eltwise_logistic;
+ }},
+ {"square", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+ alpha = 0.0f;
+ beta = 0.0f;
+ opType = Square;
+ algorithm = mkldnn::eltwise_square;
+ }},
+ {"abs", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+ alpha = 0.0f;
+ beta = 0.0f;
+ opType = Abs;
+ algorithm = mkldnn::eltwise_abs;
+ }},
+ {"sqrt", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+ alpha = 0.0f;
+ beta = 0.0f;
+ opType = Sqrt;
+ algorithm = mkldnn::eltwise_sqrt;
+ }},
+ {"linear", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+ alpha = activationLayer->GetParamAsFloat("alpha", 1.0f);
+ beta = activationLayer->GetParamAsFloat("beta", 0.0f);
+ opType = Linear;
+ algorithm = mkldnn::eltwise_linear;
+ }},
+ {"bounded_relu", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+ alpha = activationLayer->GetParamAsFloat("alpha", 0.0f);
+ beta = 0.0f;
+ opType = BoundedRelu;
+ algorithm = mkldnn::eltwise_bounded_relu;
+ }},
+ {"soft_relu", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+ alpha = 0.0f;
+ beta = 0.0f;
+ opType = SoftRelu;
+ algorithm = mkldnn::eltwise_soft_relu;
+ }},
+ {"relu6", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+ alpha = activationLayer->GetParamAsFloat("n", 6.0f); // the bound comes from parameter "n" (6 when absent)
+ beta = 0.0f;
+ opType = Relu6;
+ algorithm = mkldnn::eltwise_bounded_relu; // Relu6 reuses the bounded_relu algorithm with alpha as the bound
+ }},
+ {"clamp", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+ alpha = activationLayer->GetParamAsFloat("max", 1.0f); // note the order: alpha holds "max" ...
+ beta = activationLayer->GetParamAsFloat("min", 0.0f); // ... and beta holds "min"
+ opType = Clamp;
+ algorithm = mkldnn::eltwise_clamp;
+ }},
+ {"exp", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+ alpha = 0.0f;
+ beta = 0.0f;
+ opType = Exp;
+ algorithm = mkldnn::eltwise_exp;
+ }},
+ {"not", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+ alpha = 0.0f;
+ beta = 0.0f;
+ opType = LogicalNot; // `algorithm` is intentionally left untouched: no mkldnn algorithm is assigned for this entry
+ }},
+ {"swish", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+ alpha = activationLayer->GetParamAsFloat("alpha", 1.0f);
+ beta = 0.0f;
+ opType = Swish;
+ algorithm = mkldnn::eltwise_swish;
+ }},
+ {"hswish", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+ alpha = 0.0f;
+ beta = 0.0f;
+ opType = Hswish;
+ algorithm = mkldnn::eltwise_hswish;
+ }},
+ {"mish", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+ alpha = 0.0f;
+ beta = 0.0f;
+ opType = Mish;
+ algorithm = mkldnn::eltwise_mish;
+ }},
+ {"hsigmoid", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
+ alpha = 0.0f;
+ beta = 0.0f;
+ opType = Hsigmoid;
+ algorithm = mkldnn::eltwise_hsigmoid;
+ }},
+};
-bool MKLDNNEltwiseNode::isUnitScales() {
- auto * eltwiseLayer = dynamic_cast<EltwiseLayer*>(getCnnLayer().get());
- if (eltwiseLayer == nullptr)
- THROW_IE_EXCEPTION << "Cannot get eltwise layer " << getName();
+void MKLDNNEltwiseNode::init() { // derives eltwiseOp (and, where applicable, eltwiseAlgorithm / alpha / beta / gamma) from the wrapped CNN layer; throws for unsupported layers
+ InferenceEngine::details::CaselessEq<std::string> comparator; // layer type names are matched case-insensitively
+ auto layerType = getCnnLayer().get()->type;
- if (eltwiseLayer->coeff.empty())
- return true;
+ auto * eltwiseLayer = dynamic_cast<EltwiseLayer*>(getCnnLayer().get());
+ if (eltwiseLayer) { // classic Eltwise layer: translate its _operation enum
+ if (!eltwiseLayer->coeff.empty())
+ THROW_IE_EXCEPTION << "Eltwise node with name `" << getName() << "` doesn't support input coefficients.";
+
+ switch (eltwiseLayer->_operation) {
+ case EltwiseLayer::Sum: eltwiseOp = Add; break;
+ case EltwiseLayer::Prod: eltwiseOp = Multiply; break;
+ case EltwiseLayer::Max: eltwiseOp = Maximum; break;
+ case EltwiseLayer::Sub: eltwiseOp = Subtract; break;
+ case EltwiseLayer::Min: eltwiseOp = Minimum; break;
+ case EltwiseLayer::Div: eltwiseOp = Divide; break;
+ case EltwiseLayer::Squared_diff: eltwiseOp = SquaredDifference; break;
+ case EltwiseLayer::Floor_mod: eltwiseOp = FloorMod; break;
+ case EltwiseLayer::Pow: eltwiseOp = PowerDynamic; break;
+ case EltwiseLayer::Equal: eltwiseOp = Equal; break;
+ case EltwiseLayer::Not_equal: eltwiseOp = NotEqual; break;
+ case EltwiseLayer::Greater: eltwiseOp = Greater; break;
+ case EltwiseLayer::Greater_equal: eltwiseOp = GreaterEqual; break;
+ case EltwiseLayer::Less: eltwiseOp = Less; break;
+ case EltwiseLayer::Less_equal: eltwiseOp = LessEqual; break;
+ case EltwiseLayer::Logical_AND: eltwiseOp = LogicalAnd; break;
+ case EltwiseLayer::Logical_OR: eltwiseOp = LogicalOr; break;
+ case EltwiseLayer::Logical_XOR: eltwiseOp = LogicalXor; break;
+ default: THROW_IE_EXCEPTION << "Unsupported algorithm for Eltwise node with name `" << getName() << "`.";
+ }
+ } else if (comparator(layerType, "mod")) {
+ eltwiseOp = Mod;
+ } else if (comparator(layerType, "power")) {
+ eltwiseOp = PowerStatic;
+
+ auto *powerLayer = dynamic_cast<InferenceEngine::PowerLayer *>(getCnnLayer().get());
+ if (powerLayer == nullptr)
+ THROW_IE_EXCEPTION << "Cannot convert power layer.";
+
+ alpha = powerLayer->power; // for PowerStatic: alpha = power, beta = scale, gamma = offset
+ beta = powerLayer->scale;
+ gamma = powerLayer->offset;
+ } else if (comparator(layerType, "scaleshift")) {
+ if (getCnnLayer().get()->blobs.size() == 2) { // two blobs -> MulAdd; otherwise a plain Multiply
+ eltwiseOp = MulAdd;
+ eltwiseAlgorithm = mkldnn::depthwise_scale_shift;
+ } else {
+ eltwiseOp = Multiply;
+ }
+ } else if (comparator(layerType, "prelu")) {
+ eltwiseOp = Prelu;
+ eltwiseAlgorithm = mkldnn::depthwise_prelu;
+ } else if (comparator(layerType, "activation") && initializers.find(getCnnLayer().get()->GetParamAsString("type")) != initializers.end()) { // generic Activation layer: the concrete function is named by its "type" parameter
+ initializers[getCnnLayer().get()->GetParamAsString("type")](getCnnLayer().get(), eltwiseOp, eltwiseAlgorithm, alpha, beta);
+ } else if (comparator(layerType, "relu") || // dedicated activation layers: the layer type itself is the key into `initializers`
+ comparator(layerType, "gelu") ||
+ comparator(layerType, "elu") ||
+ comparator(layerType, "sigmoid") ||
+ comparator(layerType, "logistic") ||
+ comparator(layerType, "tanh") ||
+ comparator(layerType, "relu6") ||
+ comparator(layerType, "exp") ||
+ comparator(layerType, "not") ||
+ comparator(layerType, "clamp") ||
+ comparator(layerType, "swish") ||
+ comparator(layerType, "hswish") ||
+ comparator(layerType, "mish") ||
+ comparator(layerType, "hsigmoid")) {
+ initializers[layerType](getCnnLayer().get(), eltwiseOp, eltwiseAlgorithm, alpha, beta);
+ } else {
+ THROW_IE_EXCEPTION << "Unsupported algorithm for Eltwise node with name `" << getName() << "`.";
+ }
+}
- for (auto scale : eltwiseLayer->coeff) {
- if (scale != 1.0f)
- return false;
+size_t MKLDNNEltwiseNode::getOpInputsNum() const { // number of inputs the node's operation consumes (1, 2 or 3); throws for an unknown operation type
+ switch (getOpType()) {
+ case Relu: case Gelu: case Elu: case Tanh: case Logistic: case Square: case Abs: case Sqrt: case PowerStatic:
+ case Linear: case BoundedRelu: case SoftRelu: case Relu6: case Exp: case Clamp: case Swish: case Hswish: case Mish: case Hsigmoid:
+ case LogicalNot:
+ return 1; // unary operations
+ case Add: case Subtract: case Multiply: case Divide: case FloorMod: case Mod: case Maximum: case Minimum: case SquaredDifference:
+ case PowerDynamic: case Equal: case NotEqual: case Greater: case GreaterEqual: case Less: case LessEqual: case LogicalAnd:
+ case LogicalOr: case LogicalXor: case Prelu:
+ return 2; // binary operations
+ case MulAdd:
+ return 3; // the only ternary operation
+ default: THROW_IE_EXCEPTION << "Unsupported operation for Eltwise node with name `" << getName() << "`.";
}
+}
- return true;
+bool MKLDNNEltwiseNode::isSum() { // true when this node's operation type is Add
+ return eltwiseOp == Add;
}
bool MKLDNNEltwiseNode::isWithBroadcast() {
- bool withBroadcast = false;
auto oDims = outDims[0].ToSizeVector();
for (size_t i = 0; i < inDims.size(); i++) {
auto iDims = inDims[i].ToSizeVector();
- for (size_t j = 1; j <= iDims.size(); j++) {
- if (oDims[oDims.size() - j] != iDims[iDims.size() - j]) {
- if (iDims[iDims.size() - j] == 1) {
- withBroadcast = true;
- } else {
- THROW_IE_EXCEPTION << "Incorrect dimensions for broadcasting for " << getName();
- }
- }
- if (iDims.size() < oDims.size())
- withBroadcast = true;
- }
- if (iDims.size() == 0 && oDims.size())
- withBroadcast = true;
+ if (iDims != oDims) // any input whose dims differ from the first output's dims counts as broadcast; compatibility is no longer validated here
+ return true;
}
- return withBroadcast;
+ return false; // every input has exactly the output dims
}
void MKLDNNEltwiseNode::getSupportedDescriptors() {
- auto * eltwiseLayer = dynamic_cast<EltwiseLayer*>(getCnnLayer().get());
-
- if (eltwiseLayer == nullptr)
- THROW_IE_EXCEPTION << "Cannot convert eltwise layer.";
- op = eltwiseLayer->_operation;
-
- if (getParentEdges().size() < 2)
+ if (getParentEdges().size() < 1) // at least one input edge is required (the removed check demanded two)
THROW_IE_EXCEPTION << "Incorrect number of input edges for layer " << getName();
if (getChildEdges().empty())
THROW_IE_EXCEPTION << "Incorrect number of output edges for layer " << getName();
- if (op == EltwiseLayer::Squared_diff)
- if (getParentEdges().size() != 2)
- THROW_IE_EXCEPTION << "Incorrect number of input edges for layer " << getName() << " for operation squared_diff.\n"
- << "Expected: 2\n" << "Actual: " << getParentEdges().size();
+} // only edge presence is checked here; the exact per-operation input count is validated in initSupportedPrimitiveDescriptors()
- auto outDims = getChildEdgeAt(0)->getDims();
- for (size_t i = 0; i < getParentEdges().size(); i++) {
- auto inDims = getParentEdgeAt(i)->getDims();
- batch_dim = std::min(batch_dim, 5 - inDims.ndims());
- }
+void MKLDNNEltwiseNode::initSupportedPrimitiveDescriptors() { // Validates input count and precisions, then registers one primitive descriptor per applicable memory layout.
+ std::vector<Precision> supportedPrecisions = { // precisions that filterPrecision (below) passes through unchanged
+ Precision::FP32,
+ Precision::U8,
+ Precision::I8,
+ Precision::U16,
+ Precision::I16,
+ Precision::BF16,
+ Precision::I32
+ };
- broadcast = isWithBroadcast();
- if (broadcast) {
- auto outDims = getChildEdgeAt(0)->getDims();
- for (size_t i = 0; i < getParentEdges().size(); i++) {
- auto inDims = getParentEdgeAt(i)->getDims();
- if (inDims.ndims() > 5 || outDims.ndims() > 5)
- THROW_IE_EXCEPTION << "Eltwise node in broadcasting mode doesn't support more than 5 dims for blobs";
+ if (!supportedPrimitiveDescriptors.empty()) // descriptors were already initialized: nothing to do
+ return;
+
+ canUseOptimizedImpl = mayiuse(cpu::sse42); // without SSE4.2 filterPrecision forces every precision to FP32
+
+ size_t expectedInputsNum = getOpInputsNum();
+ for (auto& postOp : fusedWith) {
+ auto* eltwiseNode = dynamic_cast<const MKLDNNEltwiseNode*>(postOp.get());
+ if (eltwiseNode != nullptr) {
+ expectedInputsNum += eltwiseNode->getOpInputsNum() - 1; // each fused Eltwise adds its inputs minus one (presumably the one fed by the preceding op - TODO confirm)
}
}
+ if (getParentEdges().size() > MAX_ELTWISE_INPUTS)
+ THROW_IE_EXCEPTION << "Eltwise node with name `" << getName() << "` doesn't support more than " << MAX_ELTWISE_INPUTS
+ << " inputs (actual = " << getParentEdges().size() << ")";
- bool with_coeffs = !eltwiseLayer->coeff.empty();
- if (op != EltwiseLayer::Sum && with_coeffs)
- THROW_IE_EXCEPTION << "Only sum operation supports operands coefficients";
+ if (expectedInputsNum != getParentEdges().size())
+ THROW_IE_EXCEPTION << "Eltwise node with name `" << getName() << "` has invalid input number of inputs: expected = " << expectedInputsNum
+ << " (actual = " << getParentEdges().size() << ")";
- if (with_coeffs && eltwiseLayer->coeff.size() != getParentEdges().size())
- THROW_IE_EXCEPTION << "Number of provided coefficients is not equal to number of operands";
+ std::vector<InferenceEngine::Precision> inputPrecisions; // precisions of this layer's inputs, followed by the extra inputs of fused Eltwise nodes
+ for (int i = 0; i < getCnnLayer()->insData.size(); i++) {
+ inputPrecisions.push_back(getCnnLayer()->insData[i].lock()->getPrecision());
+ }
- if (with_coeffs && eltwiseLayer->precision != Precision::FP32)
- THROW_IE_EXCEPTION << "Sum with coefficients supports only FP32 precision";
+ for (auto& fusedNode : fusedWith) {
+ if (fusedNode->getType() == Eltwise) {
+ for (int i = 1; i < fusedNode->getCnnLayer()->insData.size(); i++) { // input 0 of a fused node is skipped
+ inputPrecisions.push_back(fusedNode->getCnnLayer()->insData[i].lock()->getPrecision());
+ }
+ }
+ }
- sum_scales.clear();
- for (int i = 0; i < getParentEdges().size(); i++)
- sum_scales.push_back(with_coeffs ? eltwiseLayer->coeff[i] : 1.0f);
-}
+ if (inputPrecisions.size() != getParentEdges().size())
+ THROW_IE_EXCEPTION << "Eltwise node with name `" << getName() << "` has invalid input precisions configuration.";
-void MKLDNNEltwiseNode::initSupportedPrimitiveDescriptors() {
- if (!supportedPrimitiveDescriptors.empty())
- return;
+ InferenceEngine::Precision outputPrecision = getCnnLayer()->outData[0]->getPrecision();
+ if (!fusedWith.empty()) {
+ auto lastFusedLayer = fusedWith[fusedWith.size() - 1].get()->getCnnLayer();
+ if (lastFusedLayer) {
+ outputPrecision = lastFusedLayer->outData[0]->getPrecision(); // the last fused layer's output precision takes precedence
+ }
+ }
- setPostOps(attr, true);
+ if (!mayiuse(avx512_core_bf16)) { // BF16 on any input or on the output is rejected unless avx512_core_bf16 is available
+ bool hasBF16 = false;
+ for (auto &inPrc : inputPrecisions)
+ if (inPrc == Precision::BF16)
+ hasBF16 = true;
- auto initDesc = [&] (mkldnn::memory::data_type inputDT, mkldnn::memory::data_type outputDT, memory::format format) -> PrimitiveDescInfo {
- InferenceEngine::LayerConfig config;
- impl_desc_type impl_type = impl_desc_type::ref;
- config.dynBatchSupport = true;
- for (size_t i = 0; i < getParentEdges().size(); i++) {
- InferenceEngine::DataConfig dataConfig;
- dataConfig.inPlace = (!i && canBeInPlace()) ? 0 : -1;
- dataConfig.constant = false;
+ if (outputPrecision == Precision::BF16 || hasBF16)
+ THROW_IE_EXCEPTION << "Eltwise node with name `" << getName() << "` doesn't support BF16 precision on this target.";
+ }
- if (!broadcast) {
- dataConfig.desc = MKLDNNMemoryDesc(getParentEdgeAt(i)->getDims(), inputDT, format);
- config.inConfs.push_back(dataConfig);
+ auto filterPrecision = [&](Precision& prc) { // maps a requested precision to the one the node will actually use
+ if (!canUseOptimizedImpl) {
+ return Precision(Precision::FP32);
+ } else if (std::find(supportedPrecisions.begin(), supportedPrecisions.end(), prc) == supportedPrecisions.end()) {
+ if (prc == Precision::U32 || prc == Precision::I64 || prc == Precision::U64) {
+ return Precision(Precision::I32); // U32/I64/U64 are handled as I32
} else {
- // Broadcasting support
- if (MKLDNNMemory::IsPlainFormat(format)) {
- dataConfig.desc = MKLDNNMemoryDesc(getParentEdgeAt(i)->getDims(), inputDT,
- MKLDNNMemory::GetPlainFormat(getParentEdgeAt(i)->getDims()));
- config.inConfs.push_back(dataConfig);
- } else {
- // Unsupported format for broadcast mode. Should be skipped.
- // Will mark it as undef and outer code should filter it.
- impl_type = impl_desc_type::undef;
- }
+ THROW_IE_EXCEPTION << "Eltwise node with name `" << getName() << "` doesn't support " << prc << " precision.";
}
+ } else {
+ return prc;
}
-
- InferenceEngine::DataConfig dataConfig;
- dataConfig.inPlace = -1;
- dataConfig.constant = false;
- dataConfig.desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDT, format);
- config.outConfs.push_back(dataConfig);
- return {config, impl_type, format};
};
- if (fusedWith.empty()) {
- for (const auto& format : getAvailableFormatsForDims(getChildEdgeAt(0)->getDims())) {
- // Precision of implementation is defined by precision of output tensor
- auto prec = getCnnLayer()->outData[0]->getPrecision();
- mkldnn::memory::data_type inputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(prec);
- mkldnn::memory::data_type outputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(prec);
-
- // Eltwise compare operation can have the input type different from the output type
- auto node_op = this->op;
- bool is_eltwise_compare_node = ((node_op == EltwiseLayer::eOperation::Equal) ||
- (node_op == EltwiseLayer::eOperation::Not_equal) ||
- (node_op == EltwiseLayer::eOperation::Greater) ||
- (node_op == EltwiseLayer::eOperation::Greater_equal) ||
- (node_op == EltwiseLayer::eOperation::Less) ||
- (node_op == EltwiseLayer::eOperation::Less_equal));
- if (is_eltwise_compare_node) {
- auto in_prec = getCnnLayer()->insData[0].lock()->getPrecision();
- inputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(in_prec);
- }
+ for (int i = 0; i < inputPrecisions.size(); i++) {
+ inputPrecisions[i] = filterPrecision(inputPrecisions[i]);
+ }
+ outputPrecision = filterPrecision(outputPrecision);
- if (inputDT == memory::bf16 || outputDT == memory::bf16) {
- inputDT = memory::f32;
- outputDT = memory::f32;
- }
+ // TODO: delete after new LPT (ngraph based) is merged
+ // WA is needed to handle bug in LPT that produces wrong precision after average pooling (I8/U8 instead of FP32)
+ if (eltwiseOp == MulAdd && (inputPrecisions[0] == Precision::U8 || inputPrecisions[0] == Precision::I8)) {
+ auto poolingLayer = dynamic_cast<PoolingLayer*>(getParentEdgesAtPort(0)[0]->getParent()->getCnnLayer().get());
+ if (poolingLayer && poolingLayer->_type == PoolingLayer::AVG) {
+ inputPrecisions[0] = Precision::FP32;
+ }
+ }
+
+ enum LayoutType { // memory layouts a descriptor can be built for
+ Planar,
+ ChannelsFirst,
+ Blocked
+ };
- auto impl_desc = initDesc(inputDT, outputDT, format);
+ auto initDesc = [&] (LayoutType lt) -> PrimitiveDescInfo { // builds the config (every input and the output in layout lt) together with its implementation type
+ auto createMemoryDesc = [lt](MKLDNNEdgePtr edge, Precision prc, size_t offset) -> TensorDesc {
+ if (lt == ChannelsFirst) { // order becomes {0, 2, ..., rank-1, 1}: dimension 1 is placed last
+ std::vector<size_t> blocks = edge->getDims().ToSizeVector(); // NOTE(review): blocks stays in logical dim order while order is permuted - confirm this is what the blocking descriptor expects
+ std::vector<size_t> order;
+ order.push_back(0);
+ for (size_t j = 2; j < blocks.size(); j++)
+ order.push_back(j);
+ if (blocks.size() > 1)
+ order.push_back(1);
+
+ return MKLDNNMemoryDesc(TensorDesc(prc, edge->getDims().ToSizeVector(), {blocks, order, offset}));
+ } else if (lt == Blocked && edge->getDims()[1] != 1) { // dimension 1 is split into blocks; a tensor with dims[1] == 1 falls through to the plain layout
+ size_t blockSize = mayiuse(cpu::avx512_common) ? 16 : 8; // block of 16 with AVX-512, otherwise 8
+
+ std::vector<size_t> blocks = edge->getDims().ToSizeVector();
+ std::vector<size_t> order(blocks.size());
+ for (size_t j = 0; j < order.size(); j++)
+ order[j] = j;
+
+ blocks[1] = div_up(blocks[1], blockSize);
+ blocks.push_back(blockSize);
+ order.push_back(1);
+
+ return MKLDNNMemoryDesc(TensorDesc(prc, edge->getDims().ToSizeVector(), {blocks, order, offset}));
+ } else { // plain layout: identity order over the logical dims
+ std::vector<size_t> blocks = edge->getDims().ToSizeVector();
+ std::vector<size_t> order(blocks.size());
+ for (size_t j = 0; j < order.size(); j++)
+ order[j] = j;
- if (impl_desc.getImplementationType() != impl_desc_type::undef) {
- supportedPrimitiveDescriptors.push_back(impl_desc);
+ return MKLDNNMemoryDesc(TensorDesc(prc, edge->getDims().ToSizeVector(), {blocks, order, offset}));
}
- }
- } else {
- auto ndims = getCnnLayer()->outData[0]->getDims().size();
- auto format = ndims == 2 ? memory::format::nc :
- ndims == 4 ? memory::format::nhwc :
- memory::format::ndhwc;
+ };
+ size_t offset = std::numeric_limits<size_t>::max(); // NOTE(review): looks like a sentinel for an unspecified offset - confirm
InferenceEngine::LayerConfig config;
- impl_desc_type impl_type = impl_desc_type::ref;
- config.dynBatchSupport = true;
+ config.dynBatchSupport = getChildEdgeAt(0)->getDims().ndims() > 1 && getChildEdgeAt(0)->getDims() == getParentEdgeAt(0)->getDims(); // only when the output rank exceeds 1 and the output dims equal input 0's dims
+
for (size_t i = 0; i < getParentEdges().size(); i++) {
InferenceEngine::DataConfig dataConfig;
- dataConfig.inPlace = -1;
+ dataConfig.inPlace = (!i && canBeInPlace() && inputPrecisions[i] == outputPrecision) ? 0 : -1; // only input 0 may be in-place, and only if its precision equals the output's
dataConfig.constant = false;
- auto inputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(
- getCnnLayer()->insData[i].lock()->getPrecision());
- dataConfig.desc = MKLDNNMemoryDesc(getParentEdgeAt(i)->getDims(), inputDT, format);
- config.inConfs.push_back(dataConfig);
- }
- auto outputDT = memory::f32;
- auto lastFusedLayer = fusedWith[fusedWith.size() - 1].get()->getCnnLayer();
- if (lastFusedLayer) {
- outputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(lastFusedLayer->outData[0]->getPrecision());
+
+ dataConfig.desc = createMemoryDesc(getParentEdgeAt(i), inputPrecisions[i], offset);
+
+ config.inConfs.push_back(dataConfig);
}
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = -1;
dataConfig.constant = false;
- dataConfig.desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDT, format);
- config.outConfs.push_back(dataConfig);
- supportedPrimitiveDescriptors.push_back({config, impl_type, format});
+ dataConfig.desc = createMemoryDesc(getChildEdgeAt(0), outputPrecision, offset);
- jep.src0_step = config.inConfs[0].desc.getDims()[1] == 1 ? 0 : 1;
- jep.src1_step = config.inConfs[1].desc.getDims()[1] == 1 ? 0 : 1;
- jep.dst_step = 1;
- jep.src0_dt = MKLDNNExtensionUtils::IEPrecisionToDataType(config.inConfs[0].desc.getPrecision());
- jep.src1_dt = MKLDNNExtensionUtils::IEPrecisionToDataType(config.inConfs[1].desc.getPrecision());
- jep.dst_dt = MKLDNNExtensionUtils::IEPrecisionToDataType(config.outConfs[0].desc.getPrecision());
- jep.src0_data_size = MKLDNNExtensionUtils::sizeOfDataType(jep.src0_dt);
- jep.src1_data_size = MKLDNNExtensionUtils::sizeOfDataType(jep.src1_dt);
- jep.dst_data_size = MKLDNNExtensionUtils::sizeOfDataType(jep.dst_dt);
- jep.eltwise_op = op;
+ config.outConfs.push_back(dataConfig);
+ impl_desc_type impl_type; // set below from the first ISA that mayiuse reports, checked in the order avx512_common, avx2, sse42
if (mayiuse(cpu::avx512_common)) {
- eltiwse_fq_kernel.reset(new jit_uni_eltwise_fq_generic<cpu::avx512_common>(jep, *attr.get()));
+ impl_type = impl_desc_type::jit_avx512;
} else if (mayiuse(cpu::avx2)) {
- eltiwse_fq_kernel.reset(new jit_uni_eltwise_fq_generic<cpu::avx2>(jep, *attr.get()));
+ impl_type = impl_desc_type::jit_avx2;
} else if (mayiuse(cpu::sse42)) {
- eltiwse_fq_kernel.reset(new jit_uni_eltwise_fq_generic<cpu::sse42>(jep, *attr.get()));
+ impl_type = impl_desc_type::jit_sse42;
+ } else {
+ impl_type = impl_desc_type::ref;
}
+
+ return {config, impl_type, MKLDNNMemoryDesc(config.outConfs[0].desc).getFormat()};
+ };
+
+ bool isChannelsFirstApplicable = one_of(getChildEdgeAt(0)->getDims().ndims(), 1, 2, 4, 5); // needs rank 1, 2, 4 or 5 and the same rank on every input and the output
+ for (size_t i = 0; i < getParentEdges().size(); i++) {
+ isChannelsFirstApplicable = isChannelsFirstApplicable && one_of(getParentEdgeAt(i)->getDims().ndims(), 1, 2, 4, 5);
+ isChannelsFirstApplicable = isChannelsFirstApplicable && getChildEdgeAt(0)->getDims().ndims() == getParentEdgeAt(i)->getDims().ndims();
+ }
+
+ bool isBlockedApplicable = one_of(getChildEdgeAt(0)->getDims().ndims(), 4, 5); // needs rank 4 or 5 and the same rank on every input and the output
+ for (size_t i = 0; i < getParentEdges().size(); i++) {
+ isBlockedApplicable = isBlockedApplicable && one_of(getParentEdgeAt(i)->getDims().ndims(), 4, 5);
+ isBlockedApplicable = isBlockedApplicable && getChildEdgeAt(0)->getDims().ndims() == getParentEdgeAt(i)->getDims().ndims();
}
+
+ if (isChannelsFirstApplicable)
+ supportedPrimitiveDescriptors.emplace_back(initDesc(ChannelsFirst));
+ if (isBlockedApplicable)
+ supportedPrimitiveDescriptors.emplace_back(initDesc(Blocked));
+ supportedPrimitiveDescriptors.emplace_back(initDesc(Planar)); // the planar descriptor is always registered, after the optional ones
}
void MKLDNNEltwiseNode::createPrimitive() {
- if (prim)
- return;
+ auto config = getSelectedPrimitiveDescriptor()->getConfig();
- auto& dstMemPtr = getChildEdgeAt(0)->getMemoryPtr();
- if (!dstMemPtr || !dstMemPtr->GetPrimitivePtr())
- THROW_IE_EXCEPTION << "Destination memory didn't allocate.";
- if (getSelectedPrimitiveDescriptor() == nullptr)
- THROW_IE_EXCEPTION << "Preferable primitive descriptor is not set.";
+ auto initDims = [this, config](size_t maxInputSize) {
+ size_t inputNum = getParentEdges().size();
- std::vector<memory::primitive_desc> srcs_pd;
- std::vector<primitive::at> srcs_p;
- for (size_t i = 0; i < getParentEdges().size(); i++) {
- auto& srcMemPtr = getParentEdgeAt(i)->getMemoryPtr();
- if (!srcMemPtr || !srcMemPtr->GetPrimitivePtr()) {
- auto parent = getParentEdgeAt(i)->getParent();
- THROW_IE_EXCEPTION << "Source memory from " << parent->getName() << " didn't allocate.";
+ dims_in.resize(inputNum);
+ for (int i = 0; i < inputNum; i++) {
+ dims_in[i].resize(maxInputSize, 1);
}
- if (op == EltwiseLayer::Sum) {
- srcs_pd.push_back(srcMemPtr->GetPrimitiveDescriptor());
- srcs_p.emplace_back(srcMemPtr->GetPrimitive());
+ dims_out.resize(maxInputSize, 1);
+
+ std::vector<size_t> order(maxInputSize);
+ auto outOrder = config.outConfs[0].desc.getBlockingDesc().getOrder();
+ for (size_t i = 0; i < order.size(); i++) {
+ if (i < order.size() - outOrder.size())
+ order[i] = i;
+ else
+ order[i] = outOrder[i - (order.size() - outOrder.size())] + (order.size() - outOrder.size());
}
- }
- if (op == EltwiseLayer::Sum && !broadcast && fusedWith.empty()) {
- try {
- auto primitive_desc = mkldnn::sum::primitive_desc(dstMemPtr->GetDescriptor(), sum_scales, srcs_pd);
- prim = std::shared_ptr<mkldnn::sum>(new mkldnn::sum(primitive_desc, srcs_p, dstMemPtr->GetPrimitive()));
- } catch (...) {
- std::cerr << "Handle this problem correctly!" << std::endl;
- prim = nullptr;
+
+ size_t outRank = config.outConfs[0].desc.getBlockingDesc().getBlockDims().size();
+ for (int i = 0; i < outRank; i++) {
+ dims_out[dims_out.size() - 1 - i] = config.outConfs[0].desc.getBlockingDesc().getBlockDims()[outRank - 1 - i];
}
- }
-}
-void MKLDNNEltwiseNode::initOptimalPrimitiveDescriptor() {
- auto selected_pd = getSelectedPrimitiveDescriptor();
- if (selected_pd == nullptr)
- THROW_IE_EXCEPTION << "Preferable primitive descriptor is not set.";
- auto config = selected_pd->getConfig();
- if (isInitConfig(config))
- return;
+ for (int i = 0; i < inputNum; i++) {
+ size_t inRank = config.inConfs[i].desc.getBlockingDesc().getBlockDims().size();
- MKLDNNNode::initOptimalPrimitiveDescriptor();
+ // WA to normalize blocked and planar layouts
+ auto inOrder = config.inConfs[i].desc.getBlockingDesc().getOrder();
+ size_t startOff = outOrder.size() != config.outConfs[0].desc.getDims().size() &&
+ outOrder[outOrder.size() - 1] != inOrder[inOrder.size() - 1] ? 1 : 0;
- auto* selectedPD = getSelectedPrimitiveDescriptor();
- if (!selectedPD) {
- return;
- }
+ for (int j = 0; j < inRank; j++) {
+ dims_in[i][dims_in[i].size() - 1 - j - startOff] = config.inConfs[i].desc.getBlockingDesc().getBlockDims()[inRank - 1 - j];
+ }
+ }
- auto& selectedConfig = getSelectedPrimitiveDescriptor()->getConfig();
- for (size_t i = 1; i < selectedConfig.inConfs.size(); i++) {
- if (selectedConfig.inConfs[0].desc.getPrecision() != selectedConfig.inConfs[i].desc.getPrecision()) {
- selectedConfig.inConfs[i].desc.setPrecision(selectedConfig.inConfs[0].desc.getPrecision());
+ for (int i = 0; i < dims_in.size(); i++) {
+ for (int j = 0; j < dims_in[i].size(); j++) {
+ if (dims_in[i][j] != dims_out[j] && dims_in[i][j] != 1)
+ THROW_IE_EXCEPTION << "Eltwise node with name `" << getName() << "` has invalid input/output dims configuration.";
+ }
}
- }
-}
+ };
-void MKLDNNEltwiseNode::setPostOps(mkldnn::primitive_attr &attr, bool initWeights) {
- mkldnn::post_ops ops;
+ auto initOffsets = [this, config](size_t maxInputSize) {
+ size_t inputNum = getParentEdges().size();
- for (auto &node : fusedWith) {
- auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get());
- if (activationNode) {
- ops.append_eltwise(1.0, activationNode->getAlgorithm(), activationNode->getAlpha(), activationNode->getBeta());
+ offsets_out.resize(maxInputSize, 1);
+ offset_out_calc(offsets_out, dims_out);
+ for (int j = 0; j < maxInputSize; j++) {
+ offsets_out[j] *= config.outConfs[0].desc.getPrecision().size();
+ }
- continue;
+ offsets_in.resize(inputNum);
+ for (int i = 0; i < inputNum; i++) {
+ offsets_in[i].resize(maxInputSize, 1);
+ offset_in_calc(offsets_in[i], dims_in[i], dims_out);
+ for (int j = 0; j < maxInputSize; j++) {
+ offsets_in[i][j] *= config.inConfs[i].desc.getPrecision().size();
+ }
}
- auto* quantizeNode = dynamic_cast<MKLDNNQuantizeNode *>(node.get());
- if (quantizeNode) {
- quantizeNode->appendPostOps(ops);
- continue;
+ start_offset_in.resize(inputNum);
+ for (size_t i = 0; i < inputNum; i++) {
+ start_offset_in[i] = getParentEdgeAt(i)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding *
+ MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(getParentEdgeAt(i)->getMemory().GetDescriptor().data.data_type));
}
+ start_offset_out = getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding *
+ MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(getChildEdgeAt(0)->getMemory().GetDescriptor().data.data_type));
+ };
- THROW_IE_EXCEPTION << "Fusing of " << NameFromType(node->getType()) << " operation to " << NameFromType(this->getType()) << " node is not implemented";
- }
+ auto collapseLastDims = [](std::vector<size_t>& dims, int dimsToCollapse) {
+ for (int i = dims.size() - 2; i > dims.size() - dimsToCollapse - 2; i--) {
+ dims[dims.size() - 1] *= dims[i];
+ }
- attr.set_post_ops(ops);
-}
+ for (int i = dims.size() - 2; i >= dimsToCollapse; i--) {
+ dims[i] = dims[i - dimsToCollapse];
+ }
-void MKLDNNEltwiseNode::dims_calc(int *dims, const MKLDNNDims &edge_dims, bool channels_first = false) {
- for (int i = 0; i < 5; i++)
- dims[i] = 1;
- int ndims = edge_dims.ndims();
- if (ndims > 5) {
- THROW_IE_EXCEPTION << "ndims should be less then 5";
- }
- for (int i = 0; i < ndims; i++) {
- dims[4 - i] = edge_dims[ndims - 1 - i];
- }
- if (edge_dims.ndims() && !(broadcast && edge_dims[0] == getChildEdgeAt(0)->getDims()[0]))
- dims[batch_dim] = std::min(dims[batch_dim], batchToProcess());
-
- if (channels_first) {
- auto ch_idx = 5 - ndims + 1;
- auto ch = dims[ch_idx];
- for (int i = ch_idx; i < 4; i++) {
- dims[i] = dims[i + 1];
+ for (int i = dimsToCollapse - 1; i >= 0; i--) {
+ dims[i] = 1;
}
- dims[4] = ch;
- }
-}
+ };
-void MKLDNNEltwiseNode::offset_out_calc(int *offset, int *dims) {
- int k = 1;
- for (int i = 4; i >= 0; i--) {
- offset[i] = k;
- k *= dims[i];
- }
-}
+ auto collapseLastOffsets = [](std::vector<size_t>& dims, int dimsToCollapse) {
+ for (int i = dims.size() - 2; i > dims.size() - dimsToCollapse - 2; i--) {
+ if (dims[dims.size() - 1] > 0 || dims[i] > 0)
+ dims[dims.size() - 1] = std::max(dims[dims.size() - 1], static_cast<size_t>(1)) * std::max(dims[i], static_cast<size_t>(1));
+ else
+ dims[dims.size() - 1] *= dims[i];
+ }
-void MKLDNNEltwiseNode::offset_in_calc(int *offset, int *dims_in, int *dims_out) {
- int k = 1;
- for (int i = 4; i >= 0; i--) {
- offset[i] = (dims_in[i] == dims_out[i]) ? k : 0;
- k *= dims_in[i];
- }
-}
+ for (int i = dims.size() - 2; i >= dimsToCollapse; i--) {
+ dims[i] = dims[i - dimsToCollapse];
+ }
-// Intel C++ Compiler 18.0 for Windows contains bug that doesn't allow to use templates to generate eltwise implementations
-// and to avoid all copypaste below
-template <typename T0, typename T1> void MKLDNNEltwiseNode::eltwise_add(
- const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) {
- if (!broadcast) {
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = src0_ptr[i] + src1_ptr[i];
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = src0_ptr[i] + src1_ptr[i];
- });
-#endif
- for (int j = 2; j < getParentEdges().size(); j++) {
- const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
- getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = dst_ptr[i] + src_ptr[i];
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = dst_ptr[i] + src_ptr[i];
- });
-#endif
- }
- } else {
- int dims_out[5], dims_in0[5], dims_in1[5];
- int offset_out[5], offset_in0[5], offset_in1[5];
- auto& child_edge_dims = getChildEdgeAt(0)->getDims();
- auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
- auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
- dims_calc(dims_out, child_edge_dims);
- dims_calc(dims_in0, parent0_edge_dims);
- dims_calc(dims_in1, parent1_edge_dims);
- offset_out_calc(offset_out, dims_out);
- offset_in_calc(offset_in0, dims_in0, dims_out);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = src0_ptr[index_in0] + src1_ptr[index_in1];
- }
- }
- }
- }
- }
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = src0_ptr[index_in0] + src1_ptr[index_in1];
- }
- });
-#endif
- for (size_t n = 2; n < getParentEdges().size(); n++) {
- const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
- getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
- auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
- dims_calc(dims_in1, parent_edge_dims);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = dst_ptr[index_out] + src_ptr[index_in];
- }
- }
- }
- }
- }
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = dst_ptr[index_out] + src_ptr[index_in];
- }
- });
-#endif
+ for (int i = dimsToCollapse - 1; i >= 0; i--) {
+ dims[i] = 0;
}
- }
-}
+ };
-template <typename T0, typename T1> void MKLDNNEltwiseNode::eltwise_prod(
- const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) {
- if (!broadcast) {
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = src0_ptr[i] * src1_ptr[i];
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = src0_ptr[i] * src1_ptr[i];
- });
-#endif
- for (int j = 2; j < getParentEdges().size(); j++) {
- const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
- getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = dst_ptr[i] * src_ptr[i];
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = dst_ptr[i] * src_ptr[i];
- });
-#endif
- }
- } else {
- int dims_out[5], dims_in0[5], dims_in1[5];
- int offset_out[5], offset_in0[5], offset_in1[5];
- auto& child_edge_dims = getChildEdgeAt(0)->getDims();
- auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
- auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
- dims_calc(dims_out, child_edge_dims);
- dims_calc(dims_in0, parent0_edge_dims);
- dims_calc(dims_in1, parent1_edge_dims);
- offset_out_calc(offset_out, dims_out);
- offset_in_calc(offset_in0, dims_in0, dims_out);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = src0_ptr[index_in0] * src1_ptr[index_in1];
- }
- }
- }
- }
- }
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = src0_ptr[index_in0] * src1_ptr[index_in1];
- }
- });
-#endif
- for (size_t n = 2; n < getParentEdges().size(); n++) {
- const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
- getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
- auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
- dims_calc(dims_in1, parent_edge_dims);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = dst_ptr[index_out] * src_ptr[index_in];
- }
- }
- }
- }
- }
-#else
- parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = dst_ptr[index_out] * src_ptr[index_in];
- });
-#endif
- }
- }
-}
+ tensorRank = std::max(static_cast<size_t>(optimalTensorRank), config.outConfs[0].desc.getBlockingDesc().getBlockDims().size());
+ initDims(tensorRank);
-template <typename T0, typename T1> void MKLDNNEltwiseNode::eltwise_max(
- const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) {
- if (!broadcast) {
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = std::max(src0_ptr[i], (T0)src1_ptr[i]);
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = std::max(src0_ptr[i], (T0)src1_ptr[i]);
- });
-#endif
- for (int j = 2; j < getParentEdges().size(); j++) {
- const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
- getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = std::max(dst_ptr[i], (T0)src_ptr[i]);
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = std::max(dst_ptr[i], (T0)src_ptr[i]);
- });
-#endif
- }
- } else {
- int dims_out[5], dims_in0[5], dims_in1[5];
- int offset_out[5], offset_in0[5], offset_in1[5];
- auto& child_edge_dims = getChildEdgeAt(0)->getDims();
- auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
- auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
- dims_calc(dims_out, child_edge_dims);
- dims_calc(dims_in0, parent0_edge_dims);
- dims_calc(dims_in1, parent1_edge_dims);
- offset_out_calc(offset_out, dims_out);
- offset_in_calc(offset_in0, dims_in0, dims_out);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = std::max(src0_ptr[index_in0], (T0)src1_ptr[index_in1]);
- }
- }
+ auto outOrder = config.outConfs[0].desc.getBlockingDesc().getOrder();
+ size_t oc_size = 0;
+ offsets_oc.resize(tensorRank, 0);
+ if (isFusedWith(Quantize)) {
+ size_t offset_oc = 1;
+ for (int i = outOrder.size() - 1; i >= 0; i--) {
+ if (outOrder[i] == 1) {
+ int oc_dim_idx = i + (tensorRank - outOrder.size());
+ offsets_oc[oc_dim_idx] = offset_oc;
+ offset_oc *= dims_out[oc_dim_idx];
}
}
+ oc_size = offsets_oc[dims_out.size() - 1] != 0 ? dims_out[dims_out.size() - 1] : 1;
}
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = std::max(src0_ptr[index_in0], (T0)src1_ptr[index_in1]);
- }
- });
-#endif
- for (size_t n = 2; n < getParentEdges().size(); n++) {
- const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
- getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
- auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
- dims_calc(dims_in1, parent_edge_dims);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = std::max(dst_ptr[index_out], (T0)src_ptr[index_in]);
- }
- }
- }
- }
- }
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = std::max(dst_ptr[index_out], (T0)src_ptr[index_in]);
- }
- });
-#endif
- }
- }
-}
-template <typename T0, typename T1> void MKLDNNEltwiseNode::eltwise_sub(
- const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) {
- if (!broadcast) {
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = src0_ptr[i] - src1_ptr[i];
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = src0_ptr[i] - src1_ptr[i];
- });
-#endif
- for (int j = 2; j < getParentEdges().size(); j++) {
- const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
- getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = dst_ptr[i] - src_ptr[i];
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = dst_ptr[i] - src_ptr[i];
- });
-#endif
- }
- } else {
- int dims_out[5], dims_in0[5], dims_in1[5];
- int offset_out[5], offset_in0[5], offset_in1[5];
- auto& child_edge_dims = getChildEdgeAt(0)->getDims();
- auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
- auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
- dims_calc(dims_out, child_edge_dims);
- dims_calc(dims_in0, parent0_edge_dims);
- dims_calc(dims_in1, parent1_edge_dims);
- offset_out_calc(offset_out, dims_out);
- offset_in_calc(offset_in0, dims_in0, dims_out);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = src0_ptr[index_in0] - src1_ptr[index_in1];
- }
- }
- }
- }
+ fullWorkAmount = 1;
+ for (int i = 0; i < dims_out.size(); i++) {
+ fullWorkAmount *= dims_out[i];
}
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = src0_ptr[index_in0] - src1_ptr[index_in1];
- }
- });
-#endif
- for (size_t n = 2; n < getParentEdges().size(); n++) {
- const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
- getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
- auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
- dims_calc(dims_in1, parent_edge_dims);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = dst_ptr[index_out] - src_ptr[index_in];
- }
- }
+
+ size_t minimalConcurrency = parallel_get_max_threads();
+ size_t minimalJitWorkAmount = 256;
+ size_t currentJitWorkAmount = dims_out[dims_out.size() - 1];
+ int collapsedDims = 0;
+ if (canUseOptimizedImpl) {
+ bool hasDifferentDims = false;
+ while (currentJitWorkAmount < minimalJitWorkAmount) {
+ if (dims_out.size() - collapsedDims - 2 < 0)
+ break;
+
+ for (int j = 1; j < dims_in.size(); j++) {
+ if (dims_in[j][dims_in[j].size() - 1] != dims_in[0][dims_in[0].size() - 1]) {
+ hasDifferentDims = true;
}
}
- }
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = dst_ptr[index_out] - src_ptr[index_in];
- }
- });
-#endif
- }
- }
-}
-template <typename T0, typename T1> void MKLDNNEltwiseNode::eltwise_min(
- const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) {
- if (!broadcast) {
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = std::min(src0_ptr[i], (T0)src1_ptr[i]);
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = std::min(src0_ptr[i], (T0)src1_ptr[i]);
- });
-#endif
- for (int j = 2; j < getParentEdges().size(); j++) {
- const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
- getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = std::min(dst_ptr[i], (T0)src_ptr[i]);
+ if (oc_size > 1 && oc_size != dims_in[0][dims_in[0].size() - 1]) {
+ hasDifferentDims = true;
}
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = std::min(dst_ptr[i], (T0)src_ptr[i]);
- });
-#endif
- }
- } else {
- int dims_out[5], dims_in0[5], dims_in1[5];
- int offset_out[5], offset_in0[5], offset_in1[5];
- auto& child_edge_dims = getChildEdgeAt(0)->getDims();
- auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
- auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
- dims_calc(dims_out, child_edge_dims);
- dims_calc(dims_in0, parent0_edge_dims);
- dims_calc(dims_in1, parent1_edge_dims);
- offset_out_calc(offset_out, dims_out);
- offset_in_calc(offset_in0, dims_in0, dims_out);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = std::min(src0_ptr[index_in0], (T0)src1_ptr[index_in1]);
+
+ bool canCollapse = true;
+ for (int i = 0; i < dims_in.size(); i++) {
+ if (dims_in[i][dims_in[i].size() - 2] != 1) {
+ if (dims_in[i][dims_in[i].size() - 1] == 1) {
+ canCollapse = false;
+ break;
}
- }
- }
- }
- }
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = std::min(src0_ptr[index_in0], (T0)src1_ptr[index_in1]);
- }
- });
-#endif
- for (size_t n = 2; n < getParentEdges().size(); n++) {
- const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
- getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
- auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
- dims_calc(dims_in1, parent_edge_dims);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = std::min(dst_ptr[index_out], (T0)src_ptr[index_in]);
- }
+
+ if (hasDifferentDims) {
+ canCollapse = false;
+ break;
}
}
}
- }
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = std::min(dst_ptr[index_out], (T0)src_ptr[index_in]);
- }
- });
-#endif
- }
- }
-}
-template <typename T0, typename T1> void MKLDNNEltwiseNode::eltwise_div(
- const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) {
- if (!broadcast) {
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = src0_ptr[i] / src1_ptr[i];
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = src0_ptr[i] / src1_ptr[i];
- });
-#endif
- for (int j = 2; j < getParentEdges().size(); j++) {
- const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
- getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = dst_ptr[i] / src_ptr[i];
+ if (!canCollapse) {
+ break;
}
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = dst_ptr[i] / src_ptr[i];
- });
-#endif
- }
- } else {
- int dims_out[5], dims_in0[5], dims_in1[5];
- int offset_out[5], offset_in0[5], offset_in1[5];
- auto& child_edge_dims = getChildEdgeAt(0)->getDims();
- auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
- auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
- dims_calc(dims_out, child_edge_dims);
- dims_calc(dims_in0, parent0_edge_dims);
- dims_calc(dims_in1, parent1_edge_dims);
- offset_out_calc(offset_out, dims_out);
- offset_in_calc(offset_in0, dims_in0, dims_out);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = src0_ptr[index_in0] / src1_ptr[index_in1];
- }
+
+ size_t nextJitWorkAmount = currentJitWorkAmount * dims_out[dims_out.size() - 2];
+ if (fullWorkAmount / nextJitWorkAmount >= minimalConcurrency) {
+ currentJitWorkAmount = nextJitWorkAmount;
+ collapsedDims++;
+
+ for (int i = 0; i < dims_in.size(); i++) {
+ collapseLastDims(dims_in[i], 1);
}
- }
- }
- }
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = src0_ptr[index_in0] / src1_ptr[index_in1];
- }
- });
-#endif
- for (size_t n = 2; n < getParentEdges().size(); n++) {
- const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
- getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
- auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
- dims_calc(dims_in1, parent_edge_dims);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = dst_ptr[index_out] / src_ptr[index_in];
- }
- }
+ collapseLastDims(dims_out, 1);
+
+ if (isFusedWith(Quantize)) {
+ collapseLastOffsets(offsets_oc, 1);
}
+ } else {
+ break;
}
}
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = dst_ptr[index_out] / src_ptr[index_in];
- }
- });
-#endif
- }
}
-}
-template <typename T0, typename T1> void MKLDNNEltwiseNode::eltwise_squared_diff(
- const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) {
- if (!broadcast) {
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = (src0_ptr[i] - src1_ptr[i]) * (src0_ptr[i] - src1_ptr[i]);
+ isDynBatchEnabled = config.dynBatchSupport;
+ batchDimIdx = tensorRank - config.outConfs[0].desc.getBlockingDesc().getBlockDims().size() + collapsedDims;
+ schedulerWorkAmount = fullWorkAmount / dims_out[dims_out.size() - 1];
+
+ initOffsets(tensorRank);
+
+ jep.inputs_number = config.inConfs.size();
+ jep.input_size = tensorRank;
+
+ for (int i = 0; i < config.inConfs.size(); i++) {
+ jep.src_size[i] = dims_in[i][dims_in[i].size() - 1];
+ jep.src_prc[i] = config.inConfs[i].desc.getPrecision();
}
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = (src0_ptr[i] - src1_ptr[i]) * (src0_ptr[i] - src1_ptr[i]);
- });
-#endif
- for (int j = 2; j < getParentEdges().size(); j++) {
- const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
- getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = (dst_ptr[i] - src_ptr[i]) * (dst_ptr[i] - src_ptr[i]);
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = (dst_ptr[i] - src_ptr[i]) * (dst_ptr[i] - src_ptr[i]);
- });
-#endif
- }
- } else {
- int dims_out[5], dims_in0[5], dims_in1[5];
- int offset_out[5], offset_in0[5], offset_in1[5];
- auto& child_edge_dims = getChildEdgeAt(0)->getDims();
- auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
- auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
- dims_calc(dims_out, child_edge_dims);
- dims_calc(dims_in0, parent0_edge_dims);
- dims_calc(dims_in1, parent1_edge_dims);
- offset_out_calc(offset_out, dims_out);
- offset_in_calc(offset_in0, dims_in0, dims_out);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = (src0_ptr[index_in0] - src1_ptr[index_in1]) * (src0_ptr[index_in0] - src1_ptr[index_in1]);
- }
- }
- }
- }
+ jep.dst_size = dims_out[dims_out.size() - 1];
+ jep.dst_prc = config.outConfs[0].desc.getPrecision();
+
+ for (int i = 0; i < config.inConfs.size(); i++) {
+ jep.src_offsets[i] = offsets_in[i];
}
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = (src0_ptr[index_in0] - src1_ptr[index_in1]) * (src0_ptr[index_in0] - src1_ptr[index_in1]);
- }
- });
-#endif
- for (size_t n = 2; n < getParentEdges().size(); n++) {
- const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
- getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
- auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
- dims_calc(dims_in1, parent_edge_dims);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = (dst_ptr[index_out] - src_ptr[index_in]) * (dst_ptr[index_out] - src_ptr[index_in]);
- }
- }
- }
- }
- }
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = (dst_ptr[index_out] - src_ptr[index_in]) * (dst_ptr[index_out] - src_ptr[index_in]);
- }
- });
-#endif
- }
+ jep.dst_offsets = offsets_out;
+
+ jep.oc_size = oc_size;
+
+ if (mayiuse(cpu::avx512_common)) {
+ eltwise_kernel.reset(new jit_uni_eltwise_generic<cpu::avx512_common>(jep, *this));
+ } else if (mayiuse(cpu::avx2)) {
+ eltwise_kernel.reset(new jit_uni_eltwise_generic<cpu::avx2>(jep, *this));
+ } else if (mayiuse(cpu::sse42)) {
+ eltwise_kernel.reset(new jit_uni_eltwise_generic<cpu::sse42>(jep, *this));
}
}
-template <typename T0, typename T1> void MKLDNNEltwiseNode::eltwise_floor_mod(
- const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) {
- if (!broadcast) {
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = src0_ptr[i] - src0_ptr[i] / src1_ptr[i] * src1_ptr[i];
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = src0_ptr[i] - src0_ptr[i] / src1_ptr[i] * src1_ptr[i];
- });
-#endif
- for (int j = 2; j < getParentEdges().size(); j++) {
- const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
- getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = dst_ptr[i] - dst_ptr[i] / src_ptr[i] * src_ptr[i];
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = dst_ptr[i] - dst_ptr[i] / src_ptr[i] * src_ptr[i];
- });
-#endif
- }
- } else {
- int dims_out[5], dims_in0[5], dims_in1[5];
- int offset_out[5], offset_in0[5], offset_in1[5];
- auto& child_edge_dims = getChildEdgeAt(0)->getDims();
- auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
- auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
- dims_calc(dims_out, child_edge_dims);
- dims_calc(dims_in0, parent0_edge_dims);
- dims_calc(dims_in1, parent1_edge_dims);
- offset_out_calc(offset_out, dims_out);
- offset_in_calc(offset_in0, dims_in0, dims_out);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = src0_ptr[index_in0] - src0_ptr[index_in0] / src1_ptr[index_in1] * src1_ptr[index_in1];
+void MKLDNNEltwiseNode::selectOptimalPrimitiveDescriptor() { // Selects, per priority type, the supported descriptor whose input descs match the most parent output descs
+ for (auto& type : getPrimitivesPriority()) { // the first priority type that yields a candidate wins (see the return below)
+ int selectedPrimitive = -1; // index of the best candidate for this type; -1 means none found yet
+ int equalsFormatCount = -1; // best match count so far; starts at -1 so a candidate with zero matches is still accepted
+ for (size_t i = 0; i < getSupportedPrimitiveDescriptors().size(); i++) {
+ impl_desc_type supportedType = getSupportedPrimitiveDescriptors()[i].getImplementationType();
+ if (type == supportedType) {
+ int equalsLocalFormatCount = 0; // number of inputs of candidate i counted as matching
+ if (getSupportedPrimitiveDescriptors()[i].getConfig().inConfs.size() > getParentEdges().size())
+ continue; // candidate declares more inputs than there are parent edges: skip it
+ for (size_t j = 0; j < getSupportedPrimitiveDescriptors()[i].getConfig().inConfs.size(); j++) {
+ auto parentEdge = getParentEdgeAt(j);
+ auto parentPtr = parentEdge->getParent();
+ // Constant parents (except on input 0) always count as a match, since reorders on them will be executed on load network stage
+ if (j > 0 && parentPtr->isConstant()) {
+ equalsLocalFormatCount++;
+ continue;
}
- }
- }
- }
- }
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = src0_ptr[index_in0] - src0_ptr[index_in0] / src1_ptr[index_in1] * src1_ptr[index_in1];
- }
- });
-#endif
- for (size_t n = 2; n < getParentEdges().size(); n++) {
- const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
- getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
- auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
- dims_calc(dims_in1, parent_edge_dims);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = dst_ptr[index_out] - dst_ptr[index_out] / src_ptr[index_in] * src_ptr[index_in];
+
+ auto parent_spd = parentPtr->getSelectedPrimitiveDescriptor();
+
+ if (parent_spd != nullptr && !parent_spd->getConfig().outConfs.empty()) {
+ int inNum = parentEdge->getInputNum();
+ if (inNum < 0 || inNum >= parent_spd->getConfig().outConfs.size()) {
+ inNum = 0; // out-of-range port index: compare against the parent's first output config instead
+ }
+ if (MKLDNNExtensionUtils::initTensorsAreEqual(
+ getSupportedPrimitiveDescriptors()[i].getConfig().inConfs[j].desc,
+ parent_spd->getConfig().outConfs[inNum].desc)) {
+ equalsLocalFormatCount++;
}
}
}
+ if (equalsLocalFormatCount > equalsFormatCount) { // strict '>' keeps the earliest candidate on ties
+ equalsFormatCount = equalsLocalFormatCount;
+ selectedPrimitive = static_cast<int>(i);
+ }
}
}
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = dst_ptr[index_out] - dst_ptr[index_out] / src_ptr[index_in] * src_ptr[index_in];
- }
- });
-#endif
+ if (selectedPrimitive >= 0) {
+ selectPrimitiveDescriptorByIndex(selectedPrimitive);
+ return;
}
}
+
+ if (getSupportedPrimitiveDescriptors().empty())
+ THROW_IE_EXCEPTION << "Supported primitive descriptors list is empty for node: " << getName();
+ // Fallback: no supported descriptor matched any type from the priority list, so select the first supported one
+ selectPrimitiveDescriptorByIndex(0);
}
-template <typename T0, typename T1> void MKLDNNEltwiseNode::eltwise_pow(
- const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) {
- if (!broadcast) {
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = std::pow(src0_ptr[i], src1_ptr[i]);
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = std::pow(src0_ptr[i], src1_ptr[i]);
- });
-#endif
- for (int j = 2; j < getParentEdges().size(); j++) {
- const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
- getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = std::pow(dst_ptr[i], src_ptr[i]);
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = std::pow(dst_ptr[i], src_ptr[i]);
- });
-#endif
- }
- } else {
- int dims_out[5], dims_in0[5], dims_in1[5];
- int offset_out[5], offset_in0[5], offset_in1[5];
- auto& child_edge_dims = getChildEdgeAt(0)->getDims();
- auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
- auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
- dims_calc(dims_out, child_edge_dims);
- dims_calc(dims_in0, parent0_edge_dims);
- dims_calc(dims_in1, parent1_edge_dims);
- offset_out_calc(offset_out, dims_out);
- offset_in_calc(offset_in0, dims_in0, dims_out);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = std::pow(src0_ptr[index_in0], src1_ptr[index_in1]);
- }
- }
- }
- }
- }
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = std::pow(src0_ptr[index_in0], src1_ptr[index_in1]);
- }
- });
-#endif
- for (size_t n = 2; n < getParentEdges().size(); n++) {
- const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
- getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
- auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
- dims_calc(dims_in1, parent_edge_dims);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = std::pow(dst_ptr[index_out], src_ptr[index_in]);
- }
- }
- }
- }
- }
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = std::pow(dst_ptr[index_out], src_ptr[index_in]);
- }
- });
-#endif
- }
+// Fills `offset` with dense strides for the shape `dims`: the last dimension gets stride 1
+// and every earlier dimension gets the product of all later extents.
+// dims[i] is read for every index of `offset`, so `dims` must be at least as long as `offset`.
+void MKLDNNEltwiseNode::offset_out_calc(std::vector<size_t>& offset, std::vector<size_t>& dims) {
+    // Accumulate in size_t: an int accumulator truncates the running product as soon as
+    // the number of elements no longer fits into int.
+    size_t stride = 1;
+    // Unsigned countdown from the last dimension to the first.
+    for (size_t i = offset.size(); i-- > 0;) {
+        offset[i] = stride;
+        stride *= dims[i];
}
}
-template <typename T0, typename T1, typename T2> void MKLDNNEltwiseNode::eltwise_equal(
- const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, const size_t dst_data_size) {
- if (!broadcast) {
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = src0_ptr[i] == src1_ptr[i];
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = src0_ptr[i] == src1_ptr[i];
- });
-#endif
- for (int j = 2; j < getParentEdges().size(); j++) {
- const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
- getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = dst_ptr[i] == src_ptr[i];
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = dst_ptr[i] == src_ptr[i];
- });
-#endif
- }
- } else {
- int dims_out[5], dims_in0[5], dims_in1[5];
- int offset_out[5], offset_in0[5], offset_in1[5];
- auto& child_edge_dims = getChildEdgeAt(0)->getDims();
- auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
- auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
- dims_calc(dims_out, child_edge_dims);
- dims_calc(dims_in0, parent0_edge_dims);
- dims_calc(dims_in1, parent1_edge_dims);
- offset_out_calc(offset_out, dims_out);
- offset_in_calc(offset_in0, dims_in0, dims_out);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = src0_ptr[index_in0] == src1_ptr[index_in1];
- }
- }
- }
- }
- }
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = src0_ptr[index_in0] == src1_ptr[index_in1];
- }
- });
-#endif
- for (size_t n = 2; n < getParentEdges().size(); n++) {
- const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
- getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
- auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
- dims_calc(dims_in1, parent_edge_dims);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = dst_ptr[index_out] == src_ptr[index_in];
- }
- }
- }
- }
- }
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = dst_ptr[index_out] == src_ptr[index_in];
- }
- });
-#endif
- }
+// Fills `offset` with the strides of an input of shape `dims_in` that is addressed with
+// coordinates of the shape `dims_out`. A dimension whose extent equals the output extent
+// gets its dense stride; any other dimension gets stride 0, so moving along it does not
+// advance the input position.
+// dims_in[i] and dims_out[i] are read for every index of `offset`.
+void MKLDNNEltwiseNode::offset_in_calc(std::vector<size_t>& offset, std::vector<size_t>& dims_in, std::vector<size_t>& dims_out) {
+    // Accumulate in size_t: an int accumulator truncates the running product as soon as
+    // the number of elements no longer fits into int.
+    size_t stride = 1;
+    // Unsigned countdown from the last dimension to the first.
+    for (size_t i = offset.size(); i-- > 0;) {
+        offset[i] = (dims_in[i] == dims_out[i]) ? stride : 0;
+        stride *= dims_in[i];
}
}
-template <typename T0, typename T1, typename T2> void MKLDNNEltwiseNode::eltwise_not_equal(
- const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, const size_t dst_data_size) {
- if (!broadcast) {
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = src0_ptr[i] != src1_ptr[i];
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = src0_ptr[i] != src1_ptr[i];
- });
-#endif
- for (int j = 2; j < getParentEdges().size(); j++) {
- const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
- getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = dst_ptr[i] != src_ptr[i];
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = dst_ptr[i] != src_ptr[i];
- });
-#endif
- }
- } else {
- int dims_out[5], dims_in0[5], dims_in1[5];
- int offset_out[5], offset_in0[5], offset_in1[5];
- auto& child_edge_dims = getChildEdgeAt(0)->getDims();
- auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
- auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
- dims_calc(dims_out, child_edge_dims);
- dims_calc(dims_in0, parent0_edge_dims);
- dims_calc(dims_in1, parent1_edge_dims);
- offset_out_calc(offset_out, dims_out);
- offset_in_calc(offset_in0, dims_in0, dims_out);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = src0_ptr[index_in0] != src1_ptr[index_in1];
- }
- }
+// Runs the JIT eltwise kernel with a fixed five-level parallel loop over dims_out[0..4].
+// Every kernel call receives the source/destination pointers for one coordinate of those
+// five dimensions and a work amount equal to the extent of the last output dimension.
+// src_ptrs: one base pointer per input; dst_ptr: base pointer of the output.
+void MKLDNNEltwiseNode::executeOptimized6D(const std::vector<const uint8_t *>& src_ptrs, uint8_t *dst_ptr) {
+    const size_t numInputs = src_ptrs.size();
+
+    parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4],
+        [&](size_t d0, size_t d1, size_t d2, size_t d3, size_t d4) {
+            // TODO: reimplement initializer via jit approach
+            // Offset of the current coordinate inside each input; it is added to uint8_t
+            // pointers below, so it is a byte offset.
+            size_t srcOffset[MAX_ELTWISE_INPUTS] = {0};
+            for (size_t in = 0; in < numInputs; ++in) {
+                const auto& strides = offsets_in[in];
+                srcOffset[in] = d0 * strides[0] + d1 * strides[1] + d2 * strides[2] +
+                                d3 * strides[3] + d4 * strides[4];
}
- }
- }
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = src0_ptr[index_in0] != src1_ptr[index_in1];
+            const size_t dstOffset = d0 * offsets_out[0] + d1 * offsets_out[1] + d2 * offsets_out[2] +
+                                     d3 * offsets_out[3] + d4 * offsets_out[4];
+            // Element index for the kernel's oc_off argument; scaled by sizeof(float) below.
+            const size_t ocIndex = d0 * offsets_oc[0] + d1 * offsets_oc[1] + d2 * offsets_oc[2] +
+                                   d3 * offsets_oc[3] + d4 * offsets_oc[4];
+
+            auto arg = jit_eltwise_call_args();
+            for (size_t in = 0; in < numInputs; ++in) {
+                arg.src_ptr[in] = src_ptrs[in] + srcOffset[in];
}
+            arg.dst = dst_ptr + dstOffset;
+            arg.work_amount = static_cast<size_t>(dims_out[dims_out.size() - 1]);
+            arg.oc_off = ocIndex * sizeof(float);
+
+            (*eltwise_kernel)(&arg);
});
-#endif
- for (size_t n = 2; n < getParentEdges().size(); n++) {
- const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
- getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
- auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
- dims_calc(dims_in1, parent_edge_dims);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = dst_ptr[index_out] != src_ptr[index_in];
- }
- }
- }
- }
- }
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = dst_ptr[index_out] != src_ptr[index_in];
- }
- });
-#endif
- }
- }
}
-template <typename T0, typename T1, typename T2> void MKLDNNEltwiseNode::eltwise_less(
- const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, const size_t dst_data_size) {
- if (!broadcast) {
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = src0_ptr[i] < src1_ptr[i];
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = src0_ptr[i] < src1_ptr[i];
- });
-#endif
- for (int j = 2; j < getParentEdges().size(); j++) {
- const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
- getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = dst_ptr[i] < src_ptr[i];
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = dst_ptr[i] < src_ptr[i];
- });
-#endif
- }
- } else {
- int dims_out[5], dims_in0[5], dims_in1[5];
- int offset_out[5], offset_in0[5], offset_in1[5];
- auto& child_edge_dims = getChildEdgeAt(0)->getDims();
- auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
- auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
- dims_calc(dims_out, child_edge_dims);
- dims_calc(dims_in0, parent0_edge_dims);
- dims_calc(dims_in1, parent1_edge_dims);
- offset_out_calc(offset_out, dims_out);
- offset_in_calc(offset_in0, dims_in0, dims_out);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = src0_ptr[index_in0] < src1_ptr[index_in1];
- }
- }
- }
- }
- }
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = src0_ptr[index_in0] < src1_ptr[index_in1];
+// Runs the JIT eltwise kernel for an arbitrary number of output dimensions.
+// schedulerWorkAmount work items are split between threads; each item is decomposed into
+// a coordinate over every output dimension except the last one, and one kernel call is
+// issued per item with a work amount equal to the extent of the last output dimension.
+// src_ptrs: one base pointer per input; dst_ptr: base pointer of the output.
+void MKLDNNEltwiseNode::executeOptimizedGeneric(const std::vector<const uint8_t *>& src_ptrs, uint8_t *dst_ptr) {
+    const size_t numInputs = src_ptrs.size();
+
+    parallel_nt(0, [&](const int ithr, const int nthr) {
+        // Work items [first, last) assigned to this thread by splitter().
+        size_t first = 0, last = 0;
+        splitter(schedulerWorkAmount, nthr, ithr, first, last);
+
+        // Coordinate of the current work item; one entry per output dimension but the last.
+        std::vector<size_t> coord(dims_out.size() - 1, 0);
+
+        for (size_t item = first; item < last; ++item) {
+            // Decompose the flat work index into coordinates, last covered dimension first.
+            size_t rest = item;
+            for (size_t dim = coord.size(); dim-- > 0;) {
+                coord[dim] = rest % dims_out[dim];
+                rest /= dims_out[dim];
}
- });
-#endif
- for (size_t n = 2; n < getParentEdges().size(); n++) {
- const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
- getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
- auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
- dims_calc(dims_in1, parent_edge_dims);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = dst_ptr[index_out] < src_ptr[index_in];
- }
- }
+
+            // Offset of the current coordinate inside each input (added to uint8_t pointers below).
+            size_t srcOffset[MAX_ELTWISE_INPUTS] = {0};
+            for (size_t in = 0; in < numInputs; ++in) {
+                srcOffset[in] = 0;
+                for (size_t dim = 0; dim < coord.size(); ++dim) {
+                    srcOffset[in] += coord[dim] * offsets_in[in][dim];
}
}
- }
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = dst_ptr[index_out] < src_ptr[index_in];
- }
- });
-#endif
- }
- }
-}
-template <typename T0, typename T1, typename T2> void MKLDNNEltwiseNode::eltwise_less_equal(
- const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, const size_t dst_data_size) {
- if (!broadcast) {
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = src0_ptr[i] <= src1_ptr[i];
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = src0_ptr[i] <= src1_ptr[i];
- });
-#endif
- for (int j = 2; j < getParentEdges().size(); j++) {
- const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
- getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = dst_ptr[i] <= src_ptr[i];
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = dst_ptr[i] <= src_ptr[i];
- });
-#endif
- }
- } else {
- int dims_out[5], dims_in0[5], dims_in1[5];
- int offset_out[5], offset_in0[5], offset_in1[5];
- auto& child_edge_dims = getChildEdgeAt(0)->getDims();
- auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
- auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
- dims_calc(dims_out, child_edge_dims);
- dims_calc(dims_in0, parent0_edge_dims);
- dims_calc(dims_in1, parent1_edge_dims);
- offset_out_calc(offset_out, dims_out);
- offset_in_calc(offset_in0, dims_in0, dims_out);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = src0_ptr[index_in0] <= src1_ptr[index_in1];
- }
- }
+            size_t dstOffset = 0;
+            for (size_t dim = 0; dim < coord.size(); ++dim) {
+                dstOffset += coord[dim] * offsets_out[dim];
}
- }
- }
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = src0_ptr[index_in0] <= src1_ptr[index_in1];
+
+            auto arg = jit_eltwise_call_args();
+            for (size_t in = 0; in < numInputs; ++in) {
+                arg.src_ptr[in] = src_ptrs[in] + srcOffset[in];
}
- });
-#endif
- for (size_t n = 2; n < getParentEdges().size(); n++) {
- const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
- getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
- auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
- dims_calc(dims_in1, parent_edge_dims);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = dst_ptr[index_out] <= src_ptr[index_in];
- }
- }
- }
+            arg.dst = dst_ptr + dstOffset;
+            arg.work_amount = static_cast<size_t>(dims_out[dims_out.size() - 1]);
+
+            // oc_off is accumulated from element indices scaled by sizeof(float).
+            arg.oc_off = 0;
+            for (size_t dim = 0; dim < coord.size(); ++dim) {
+                arg.oc_off += coord[dim] * offsets_oc[dim] * sizeof(float);
}
+
+            (*eltwise_kernel)(&arg);
}
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = dst_ptr[index_out] <= src_ptr[index_in];
- }
- });
-#endif
- }
- }
+    });
}
-template <typename T0, typename T1, typename T2> void MKLDNNEltwiseNode::eltwise_greater(
- const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, const size_t dst_data_size) {
- if (!broadcast) {
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = src0_ptr[i] > src1_ptr[i];
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = src0_ptr[i] > src1_ptr[i];
- });
-#endif
- for (int j = 2; j < getParentEdges().size(); j++) {
- const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
- getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = dst_ptr[i] > src_ptr[i];
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = dst_ptr[i] > src_ptr[i];
- });
-#endif
- }
- } else {
- int dims_out[5], dims_in0[5], dims_in1[5];
- int offset_out[5], offset_in0[5], offset_in1[5];
- auto& child_edge_dims = getChildEdgeAt(0)->getDims();
- auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
- auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
- dims_calc(dims_out, child_edge_dims);
- dims_calc(dims_in0, parent0_edge_dims);
- dims_calc(dims_in1, parent1_edge_dims);
- offset_out_calc(offset_out, dims_out);
- offset_in_calc(offset_in0, dims_in0, dims_out);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = src0_ptr[index_in0] > src1_ptr[index_in1];
- }
- }
- }
- }
+// Scalar fallback used by execute() when no JIT kernel (eltwise_kernel) is available.
+// fullWorkAmount work items are split between threads; each item is one output element,
+// addressed through offsets_in / offsets_out, and every input and the output are accessed
+// as float.
+// src_ptrs: one base pointer per input; dst_ptr: base pointer of the output.
+// Throws (THROW_IE_EXCEPTION) for an operation type that is not listed in the switch.
+void MKLDNNEltwiseNode::executeReference(const std::vector<const uint8_t *>& src_ptrs, uint8_t *dst_ptr) {
+    size_t inputNum = src_ptrs.size();
+
+    // The scalar activation helper is only created when an mkldnn algorithm is set.
+    std::shared_ptr<ref_eltwise_scalar_fwd_t> ref_eltwise_injector = nullptr;
+    if (eltwiseAlgorithm != mkldnn::algorithm_undef) {
+        ref_eltwise_injector = std::make_shared<ref_eltwise_scalar_fwd_t>(static_cast<mkldnn_alg_kind_t>(eltwiseAlgorithm), alpha, beta);
}
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = src0_ptr[index_in0] > src1_ptr[index_in1];
+
+    parallel_nt(0, [&](const int ithr, const int nthr) {
+        size_t start = 0, end = 0;
+        splitter(fullWorkAmount, nthr, ithr, start, end);
+
+        std::vector<size_t> counters(dims_out.size(), 0);
+        // Per-thread scratch holding the input values of one element. It is allocated once
+        // here instead of once per output element; every entry is overwritten before use.
+        std::vector<float> src_f(inputNum);
+
+        for (size_t iwork = start; iwork < end; ++iwork) {
+            // Decompose the flat element index into one coordinate per output dimension.
+            size_t tmp = iwork;
+            for (size_t j = counters.size(); j-- > 0;) {
+                counters[j] = tmp % dims_out[j];
+                tmp /= dims_out[j];
}
- });
-#endif
- for (size_t n = 2; n < getParentEdges().size(); n++) {
- const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
- getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
- auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
- dims_calc(dims_in1, parent_edge_dims);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = dst_ptr[index_out] > src_ptr[index_in];
- }
- }
+
+            // Offset of the element inside each input (added to uint8_t pointers below).
+            size_t index_in[MAX_ELTWISE_INPUTS] = {0};
+            for (size_t i = 0; i < inputNum; i++) {
+                index_in[i] = 0;
+                for (size_t j = 0; j < counters.size(); j++) {
+                    index_in[i] += counters[j] * offsets_in[i][j];
}
}
- }
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = dst_ptr[index_out] > src_ptr[index_in];
- }
- });
-#endif
- }
- }
+
+            size_t index_out = 0;
+            for (size_t j = 0; j < counters.size(); j++) {
+                index_out += counters[j] * offsets_out[j];
+            }
+
+            // Load the current element of every input as float.
+            for (size_t i = 0; i < inputNum; i++) {
+                src_f[i] = reinterpret_cast<const float *>(src_ptrs[i] + index_in[i])[0];
+            }
+            float* dst_ptr_f = reinterpret_cast<float *>(dst_ptr + index_out);
+
+            switch (getOpType()) {
+                // NOTE(review): ref_eltwise_injector stays null when eltwiseAlgorithm is
+                // algorithm_undef; presumably every op type in this group has an algorithm
+                // set - confirm, otherwise this dereferences a null pointer.
+                case Relu: case Gelu: case Elu: case Tanh: case Logistic: case Square: case Abs: case Sqrt:
+                case Linear: case BoundedRelu: case SoftRelu: case Relu6: case Exp: case Clamp: case Swish: case Hswish: case Mish: case Hsigmoid:
+                    *dst_ptr_f = ref_eltwise_injector->compute_scalar(src_f[0]); break;
+                case Add:               *dst_ptr_f = src_f[0] + src_f[1]; break;
+                case MulAdd:            *dst_ptr_f = src_f[0] * src_f[1] + src_f[2]; break;
+                case Subtract:          *dst_ptr_f = src_f[0] - src_f[1]; break;
+                case Multiply:          *dst_ptr_f = src_f[0] * src_f[1]; break;
+                case Divide:            *dst_ptr_f = src_f[0] / src_f[1]; break;
+                case FloorMod:          *dst_ptr_f = src_f[0] - floorf(src_f[0] / src_f[1]) * src_f[1]; break;
+                case Mod:               *dst_ptr_f = src_f[0] - truncf(src_f[0] / src_f[1]) * src_f[1]; break;
+                case Maximum:           *dst_ptr_f = std::max(src_f[0], src_f[1]); break;
+                case Minimum:           *dst_ptr_f = std::min(src_f[0], src_f[1]); break;
+                case SquaredDifference: *dst_ptr_f = powf((src_f[0] - src_f[1]), 2.f); break;
+                case PowerDynamic:      *dst_ptr_f = powf(src_f[0], src_f[1]); break;
+                case Equal:             *dst_ptr_f = src_f[0] == src_f[1]; break;
+                case NotEqual:          *dst_ptr_f = src_f[0] != src_f[1]; break;
+                case Greater:           *dst_ptr_f = src_f[0] > src_f[1]; break;
+                case GreaterEqual:      *dst_ptr_f = src_f[0] >= src_f[1]; break;
+                case Less:              *dst_ptr_f = src_f[0] < src_f[1]; break;
+                case LessEqual:         *dst_ptr_f = src_f[0] <= src_f[1]; break;
+                case LogicalAnd:        *dst_ptr_f = src_f[0] && src_f[1]; break;
+                case LogicalOr:         *dst_ptr_f = src_f[0] || src_f[1]; break;
+                case LogicalXor:        *dst_ptr_f = (src_f[0] || src_f[1]) - (src_f[0] && src_f[1]); break;
+                case LogicalNot:        *dst_ptr_f = !src_f[0]; break;
+                case PowerStatic:       *dst_ptr_f = powf(beta * src_f[0] + gamma, alpha); break;
+                case Prelu:             *dst_ptr_f = src_f[0] > 0 ? src_f[0] : src_f[0] * src_f[1]; break;
+                default: THROW_IE_EXCEPTION << "Unsupported operation type for Eltwise node with name `" << getName() << "`";
+            }
+        }
+    });
}
-template <typename T0, typename T1, typename T2> void MKLDNNEltwiseNode::eltwise_greater_equal(
- const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, const size_t dst_data_size) {
- if (!broadcast) {
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = src0_ptr[i] >= src1_ptr[i];
+// Entry point of the node: collects the input/output data pointers (shifted by
+// start_offset_in / start_offset_out) and dispatches to the JIT path when eltwise_kernel
+// is set, otherwise to the scalar reference path.
+void MKLDNNEltwiseNode::execute(mkldnn::stream strm) {
+    const size_t numInputs = getParentEdges().size();
+
+    std::vector<const uint8_t *> inputData(numInputs);
+    for (size_t in = 0; in < numInputs; ++in) {
+        const auto *base = reinterpret_cast<const uint8_t*>(getParentEdgeAt(in)->getMemory().GetData());
+        inputData[in] = base + start_offset_in[in];
}
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = src0_ptr[i] >= src1_ptr[i];
- });
-#endif
- for (int j = 2; j < getParentEdges().size(); j++) {
- const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
- getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = dst_ptr[i] >= src_ptr[i];
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = dst_ptr[i] >= src_ptr[i];
- });
-#endif
+    uint8_t *outputData = reinterpret_cast<uint8_t*>(getChildEdgeAt(0)->getMemory().GetData()) + start_offset_out;
+
+    // With dynamic batch only the batch extent of dims_out is updated. Strictly the offsets
+    // would have to be recomputed too, but all currently supported layouts keep the batch as
+    // the outermost dimension.
+    if (isDynBatchEnabled) {
+        dims_out[batchDimIdx] = static_cast<size_t>(batchToProcess());
+    }
+
+    if (eltwise_kernel) {
+        // The JIT kernel exists: use the fixed-rank loop only for the optimal tensor rank.
+        if (tensorRank != optimalTensorRank) {
+            executeOptimizedGeneric(inputData, outputData);
+        } else {
+            executeOptimized6D(inputData, outputData);
}
} else {
- int dims_out[5], dims_in0[5], dims_in1[5];
- int offset_out[5], offset_in0[5], offset_in1[5];
- auto& child_edge_dims = getChildEdgeAt(0)->getDims();
- auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
- auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
- dims_calc(dims_out, child_edge_dims);
- dims_calc(dims_in0, parent0_edge_dims);
- dims_calc(dims_in1, parent1_edge_dims);
- offset_out_calc(offset_out, dims_out);
- offset_in_calc(offset_in0, dims_in0, dims_out);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = src0_ptr[index_in0] >= src1_ptr[index_in1];
- }
- }
- }
- }
- }
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = src0_ptr[index_in0] >= src1_ptr[index_in1];
- }
- });
-#endif
- for (size_t n = 2; n < getParentEdges().size(); n++) {
- const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
- getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
- auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
- dims_calc(dims_in1, parent_edge_dims);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = dst_ptr[index_out] >= src_ptr[index_in];
- }
- }
- }
- }
- }
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = dst_ptr[index_out] >= src_ptr[index_in];
- }
- });
-#endif
- }
+        executeReference(inputData, outputData);
}
}
-template <typename T0, typename T1> void MKLDNNEltwiseNode::eltwise_logical_and(
- const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) {
- if (!broadcast) {
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = src0_ptr[i] && src1_ptr[i];
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = src0_ptr[i] && src1_ptr[i];
- });
-#endif
- for (int j = 2; j < getParentEdges().size(); j++) {
- const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
- getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = dst_ptr[i] && src_ptr[i];
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = dst_ptr[i] && src_ptr[i];
- });
-#endif
- }
- } else {
- int dims_out[5], dims_in0[5], dims_in1[5];
- int offset_out[5], offset_in0[5], offset_in1[5];
- auto& child_edge_dims = getChildEdgeAt(0)->getDims();
- auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
- auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
- dims_calc(dims_out, child_edge_dims);
- dims_calc(dims_in0, parent0_edge_dims);
- dims_calc(dims_in1, parent1_edge_dims);
- offset_out_calc(offset_out, dims_out);
- offset_in_calc(offset_in0, dims_in0, dims_out);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = src0_ptr[index_in0] && src1_ptr[index_in1];
- }
- }
- }
- }
- }
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = src0_ptr[index_in0] && src1_ptr[index_in1];
- }
- });
-#endif
- for (size_t n = 2; n < getParentEdges().size(); n++) {
- const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
- getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
- auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
- dims_calc(dims_in1, parent_edge_dims);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = dst_ptr[index_out] && src_ptr[index_in];
- }
- }
- }
- }
- }
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = dst_ptr[index_out] && src_ptr[index_in];
- }
- });
-#endif
- }
- }
+bool MKLDNNEltwiseNode::created() const {
+ return getType() == Eltwise;
}
-template <typename T0, typename T1> void MKLDNNEltwiseNode::eltwise_logical_or(
- const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) {
- if (!broadcast) {
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = src0_ptr[i] || src1_ptr[i];
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = src0_ptr[i] || src1_ptr[i];
- });
-#endif
- for (int j = 2; j < getParentEdges().size(); j++) {
- const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
- getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = dst_ptr[i] || src_ptr[i];
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = dst_ptr[i] || src_ptr[i];
- });
-#endif
- }
- } else {
- int dims_out[5], dims_in0[5], dims_in1[5];
- int offset_out[5], offset_in0[5], offset_in1[5];
- auto& child_edge_dims = getChildEdgeAt(0)->getDims();
- auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
- auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
- dims_calc(dims_out, child_edge_dims);
- dims_calc(dims_in0, parent0_edge_dims);
- dims_calc(dims_in1, parent1_edge_dims);
- offset_out_calc(offset_out, dims_out);
- offset_in_calc(offset_in0, dims_in0, dims_out);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = src0_ptr[index_in0] || src1_ptr[index_in1];
- }
- }
- }
- }
+bool MKLDNNEltwiseNode::canBeInPlace() const {
+ if (getParentEdgesAtPort(0)[0]->getParent()->getType() == Input) {
+ return false;
}
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = src0_ptr[index_in0] || src1_ptr[index_in1];
- }
- });
-#endif
- for (size_t n = 2; n < getParentEdges().size(); n++) {
- const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
- getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
- auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
- dims_calc(dims_in1, parent_edge_dims);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = dst_ptr[index_out] || src_ptr[index_in];
- }
- }
- }
+
+ for (auto& parentEdge : getParentEdges()) {
+ auto parent = parentEdge.lock()->getParent();
+ if (parent->getChildEdges().size() != 1)
+ return false;
+
+ // Workaround (WA) to prevent memory corruption caused by the in-place feature
+ if (parent->getType() == Concatenation) {
+ for (auto& parentParentEdge : parent->getParentEdges()) {
+ auto parentParent = parentParentEdge.lock()->getParent();
+ if (parentParent->getChildEdges().size() != 1)
+ return false;
}
}
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = dst_ptr[index_out] || src_ptr[index_in];
- }
- });
-#endif
- }
}
+
+ return getParentEdgesAtPort(0)[0].get()->getDims() == getChildEdgesAtPort(0)[0].get()->getDims();
}
-template <typename T0, typename T1> void MKLDNNEltwiseNode::eltwise_logical_xor(
- const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) {
- if (!broadcast) {
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = (src0_ptr[i] || src1_ptr[i]) - (src0_ptr[i] && src1_ptr[i]);
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = (src0_ptr[i] || src1_ptr[i]) - (src0_ptr[i] && src1_ptr[i]);
- });
-#endif
- for (int j = 2; j < getParentEdges().size(); j++) {
- const T1 *src_ptr = reinterpret_cast<const T1*>(getParentEdgeAt(j)->getMemory().GetData()) +
- getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-#ifdef _WIN32
- for (size_t i = 0; i < dst_data_size; i++) {
- dst_ptr[i] = (dst_ptr[i] || src_ptr[i]) - (dst_ptr[i] && src_ptr[i]);
- }
-#else
- parallel_for(dst_data_size, [&](size_t i) {
- dst_ptr[i] = (dst_ptr[i] || src_ptr[i]) - (dst_ptr[i] && src_ptr[i]);
- });
-#endif
- }
- } else {
- int dims_out[5], dims_in0[5], dims_in1[5];
- int offset_out[5], offset_in0[5], offset_in1[5];
- auto& child_edge_dims = getChildEdgeAt(0)->getDims();
- auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
- auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
- dims_calc(dims_out, child_edge_dims);
- dims_calc(dims_in0, parent0_edge_dims);
- dims_calc(dims_in1, parent1_edge_dims);
- offset_out_calc(offset_out, dims_out);
- offset_in_calc(offset_in0, dims_in0, dims_out);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = (src0_ptr[index_in0] || src1_ptr[index_in1]) - (src0_ptr[index_in0] && src1_ptr[index_in1]);
- }
+void MKLDNNEltwiseNode::appendPostOps(mkldnn::post_ops& ops) {
+ switch (getAlgorithm()) {
+ case mkldnn::eltwise_relu:
+ case mkldnn::eltwise_tanh:
+ case mkldnn::eltwise_elu:
+ case mkldnn::eltwise_square:
+ case mkldnn::eltwise_abs:
+ case mkldnn::eltwise_sqrt:
+ case mkldnn::eltwise_linear:
+ case mkldnn::eltwise_bounded_relu:
+ case mkldnn::eltwise_soft_relu:
+ case mkldnn::eltwise_logistic:
+ case mkldnn::eltwise_exp:
+ case mkldnn::eltwise_gelu:
+ case mkldnn::eltwise_clamp:
+ case mkldnn::eltwise_swish:
+ case mkldnn::eltwise_hswish:
+ case mkldnn::eltwise_mish:
+ case mkldnn::eltwise_hsigmoid:
+ ops.append_eltwise(1.0, getAlgorithm(), getAlpha(), getBeta());
+ break;
+ case mkldnn::depthwise_scale_shift:
+ case mkldnn::depthwise_prelu:
+ if (scales.empty() && shifts.empty()) {
+ size_t bufferSize = static_cast<size_t>(outDims[0][outDims[0].size() > 1 ? 1 : 0]);
+ size_t bufferSizeAligned = rnd_up(bufferSize, 16);
+
+ Blob::Ptr scalesBlob = getCnnLayer()->blobs["weights"];
+ if (scalesBlob == nullptr)
+ THROW_IE_EXCEPTION << "Cannot get weights blob in Eltwise node with name `" << getName() << "`";
+ scales.resize(bufferSizeAligned, 0);
+ const float *scalesBufferPtr = scalesBlob->buffer().as<float *>();
+ for (int i = 0; i < bufferSize; i++) {
+ scales[i] = scalesBufferPtr[scalesBlob->size() == 1 ? 0 : i];
}
- }
- }
- }
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = (src0_ptr[index_in0] || src1_ptr[index_in1]) - (src0_ptr[index_in0] && src1_ptr[index_in1]);
- }
- });
-#endif
- for (size_t n = 2; n < getParentEdges().size(); n++) {
- const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
- getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
- auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
- dims_calc(dims_in1, parent_edge_dims);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
-#ifdef _WIN32
- for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
- for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
- for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
- for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
- for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = (dst_ptr[index_out] || src_ptr[index_in]) - (dst_ptr[index_out] && src_ptr[index_in]);
- }
+
+ Blob::Ptr shiftsBlob = getCnnLayer()->blobs["biases"];
+ if (shiftsBlob != nullptr) {
+ shifts.resize(bufferSizeAligned, 0);
+ const float *shiftsBufferPtr = shiftsBlob->buffer().as<float *>();
+ for (int i = 0; i < bufferSize; i++) {
+ shifts[i] = shiftsBufferPtr[shiftsBlob->size() == 1 ? 0 : i];
}
}
}
- }
-#else
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- for (int i4 = 0; i4 < dims_out[4]; i4++) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
- size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
- dst_ptr[index_out] = (dst_ptr[index_out] || src_ptr[index_in]) - (dst_ptr[index_out] && src_ptr[index_in]);
- }
- });
-#endif
- }
- }
-}
-
-template <typename T0, typename T1, typename T2> void MKLDNNEltwiseNode::ref_eltwise2(int in0, int in1) {
- IE_ASSERT(getParentEdges().size() > 1);
-
- auto& srcMemory0 = getParentEdgeAt(in0)->getMemory();
- auto& srcMemory1 = getParentEdgeAt(in1)->getMemory();
- const T0 *src0_ptr = reinterpret_cast<const T0*>(srcMemory0.GetData()) +
- srcMemory0.GetDescriptor().data.layout_desc.blocking.offset_padding;
- const T1 *src1_ptr = reinterpret_cast<const T1*>(srcMemory1.GetData()) +
- srcMemory1.GetDescriptor().data.layout_desc.blocking.offset_padding;
- T2 *dst_ptr = reinterpret_cast<T2*>(getChildEdgeAt(0)->getMemory().GetData()) +
- getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
- const size_t dst_data_size = srcMemory0.GetSize() / sizeof(T0) / srcMemory0.GetDims()[0] * batchToProcess();
-
- switch (op) {
- case EltwiseLayer::eOperation::Equal: eltwise_equal(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
- case EltwiseLayer::eOperation::Not_equal: eltwise_not_equal(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
- case EltwiseLayer::eOperation::Less: eltwise_less(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
- case EltwiseLayer::eOperation::Less_equal: eltwise_less_equal(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
- case EltwiseLayer::eOperation::Greater: eltwise_greater(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
- case EltwiseLayer::eOperation::Greater_equal: eltwise_greater_equal(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
- default: THROW_IE_EXCEPTION << "Unsupported operation type for Eltwise node";
- }
-}
-
-template <typename T0, typename T1> void MKLDNNEltwiseNode::ref_eltwise(int in0, int in1) {
- IE_ASSERT(getParentEdges().size() > 1);
-
- auto& srcMemory0 = getParentEdgeAt(in0)->getMemory();
- auto& srcMemory1 = getParentEdgeAt(in1)->getMemory();
- const T0 *src0_ptr = reinterpret_cast<const T0*>(srcMemory0.GetData()) +
- srcMemory0.GetDescriptor().data.layout_desc.blocking.offset_padding;
- const T1 *src1_ptr = reinterpret_cast<const T1*>(srcMemory1.GetData()) +
- srcMemory1.GetDescriptor().data.layout_desc.blocking.offset_padding;
- T0 *dst_ptr = reinterpret_cast<T0*>(getChildEdgeAt(0)->getMemory().GetData()) +
- getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-
- const size_t dst_data_size = srcMemory0.GetSize() / sizeof(T0) / srcMemory0.GetDims()[0] * batchToProcess();
-
- switch (op) {
- case EltwiseLayer::eOperation::Sum: eltwise_add(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
- case EltwiseLayer::eOperation::Prod: eltwise_prod(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
- case EltwiseLayer::eOperation::Max: eltwise_max(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
- case EltwiseLayer::eOperation::Sub: eltwise_sub(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
- case EltwiseLayer::eOperation::Min: eltwise_min(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
- case EltwiseLayer::eOperation::Div: eltwise_div(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
- case EltwiseLayer::eOperation::Squared_diff: eltwise_squared_diff(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
- case EltwiseLayer::eOperation::Floor_mod: eltwise_floor_mod(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
- case EltwiseLayer::eOperation::Pow: eltwise_pow(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
- case EltwiseLayer::eOperation::Equal: eltwise_equal(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
- case EltwiseLayer::eOperation::Not_equal: eltwise_not_equal(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
- case EltwiseLayer::eOperation::Less: eltwise_less(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
- case EltwiseLayer::eOperation::Less_equal: eltwise_less_equal(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
- case EltwiseLayer::eOperation::Greater: eltwise_greater(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
- case EltwiseLayer::eOperation::Greater_equal: eltwise_greater_equal(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
- case EltwiseLayer::eOperation::Logical_AND: eltwise_logical_and(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
- case EltwiseLayer::eOperation::Logical_OR: eltwise_logical_or(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
- case EltwiseLayer::eOperation::Logical_XOR: eltwise_logical_xor(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
- default: THROW_IE_EXCEPTION << "Unsupported operation type for Eltwise node";
- }
-}
-void MKLDNNEltwiseNode::jit_eltwise_fq() {
- auto& srcMemory0 = getParentEdgeAt(0)->getMemory();
- auto& srcMemory1 = getParentEdgeAt(1)->getMemory();
- auto& dstMemory = getChildEdgeAt(0)->getMemory();
-
- const uint8_t *src0_ptr = reinterpret_cast<const uint8_t*>(srcMemory0.GetData()) +
- srcMemory0.GetDescriptor().data.layout_desc.blocking.offset_padding *
- MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(srcMemory0.GetDescriptor().data.data_type));
- const uint8_t *src1_ptr = reinterpret_cast<const uint8_t*>(srcMemory1.GetData()) +
- srcMemory1.GetDescriptor().data.layout_desc.blocking.offset_padding *
- MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(srcMemory1.GetDescriptor().data.data_type));
- uint8_t *dst_ptr = reinterpret_cast<uint8_t*>(dstMemory.GetData()) +
- dstMemory.GetDescriptor().data.layout_desc.blocking.offset_padding *
- MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(dstMemory.GetDescriptor().data.data_type));
-
- if (!broadcast) {
- auto& dims = getParentEdgeAt(0)->getDims();
-
- int N = batchToProcess();
- int C = dims[1];
- int D = dims.ndims() > 4 ? dims[2] : 1;
- int H = dims.ndims() > 2 ? dims[dims.ndims() - 2] : 1;
- int W = dims.ndims() > 3 ? dims[dims.ndims() - 1] : 1;
-
- parallel_for4d(N, D, H, W, [&](int n, int d, int h, int w) {
- size_t off = n * D * H * W * C + d * H * W * C + h * W * C + w * C;
-
- auto arg = jit_eltwise_fq_call_args();
- arg.src0 = src0_ptr + off * jep.src0_data_size;
- arg.src1 = src1_ptr + off * jep.src1_data_size;
- arg.dst = dst_ptr + off * jep.dst_data_size;
- arg.work_amount = static_cast<size_t>(C);
-
- (*eltiwse_fq_kernel)(&arg);
- });
- } else {
- int dims_out[5], dims_in0[5], dims_in1[5];
- int offset_out[5], offset_in0[5], offset_in1[5];
- auto& child_edge_dims = getChildEdgeAt(0)->getDims();
- auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
- auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
- dims_calc(dims_out, child_edge_dims, true);
- dims_calc(dims_in0, parent0_edge_dims, true);
- dims_calc(dims_in1, parent1_edge_dims, true);
- offset_out_calc(offset_out, dims_out);
- offset_in_calc(offset_in0, dims_in0, dims_out);
- offset_in_calc(offset_in1, dims_in1, dims_out);
-
- parallel_for4d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], [&](size_t i0, size_t i1, size_t i2, size_t i3) {
- size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3];
- size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3];
- size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3];
-
- auto arg = jit_eltwise_fq_call_args();
- arg.src0 = src0_ptr + index_in0 * jep.src0_data_size;
- arg.src1 = src1_ptr + index_in1 * jep.src1_data_size;
- arg.dst = dst_ptr + index_out * jep.dst_data_size;
- arg.work_amount = static_cast<size_t>(dims_out[4]);
-
- (*eltiwse_fq_kernel)(&arg);
- });
+ ops.append_depthwise(getAlgorithm(), &scales[0], shifts.empty() ? nullptr : &shifts[0]);
+ break;
+ default: THROW_IE_EXCEPTION << "Appending Eltwise node with name `" << getName() << "` as post operation is not supported";
}
}
-void MKLDNNEltwiseNode::execute(mkldnn::stream strm) {
- if (prim) {
- MKLDNNNode::execute(strm);
- } else {
- if (op == EltwiseLayer::Floor_mod) {
- for (size_t i = 0; i < getParentEdges().size(); i++)
- if (getParentEdgeAt(i)->getDesc().getPrecision() != Precision::I32)
- THROW_IE_EXCEPTION << "Floor_mod supports only I32 precision of inputs";
- if (getChildEdgeAt(0)->getDesc().getPrecision() != Precision::I32)
- THROW_IE_EXCEPTION << "Floor_mod supports only I32 precision of output";
- }
-
- if (getParentEdges().size() > 2) {
- Precision pi = getParentEdgeAt(0)->getDesc().getPrecision();
- Precision po = getChildEdgeAt(0)->getDesc().getPrecision();
- for (int i = 1; i < getParentEdges().size(); i++) {
- if (getParentEdgeAt(i)->getDesc().getPrecision() != pi)
- THROW_IE_EXCEPTION << "If Eltwise node has more than 2 inputs, all inputs must have same precision";
+bool MKLDNNEltwiseNode::canFuse(const MKLDNNNodePtr& node) const {
+ auto isOneOf = [](EltwiseOpType alg, std::vector<EltwiseOpType> algs) {
+ for (auto a : algs) {
+ if (alg == a) {
+ return true;
}
- if (pi != po) {
- THROW_IE_EXCEPTION << "If Eltwise node has more than 2 inputs, all inputs and output must have same precision";
- }
- if (pi == Precision::FP32)
- ref_eltwise<float, float>(0, 1);
- else if (pi == Precision::I32)
- ref_eltwise<int32_t, int32_t>(0, 1);
- else if (pi == Precision::I8)
- ref_eltwise<int8_t, int8_t>(0, 1);
- else if (pi == Precision::U8)
- ref_eltwise<uint8_t, uint8_t>(0, 1);
- else
- THROW_IE_EXCEPTION << "If Eltwise node has more than 2 inputs, only FP32, I32, I8, U8 are supported";
- return;
}
+ return false;
+ };
- Precision pi0 = getParentEdgeAt(0)->getDesc().getPrecision();
- Precision pi1 = getParentEdgeAt(1)->getDesc().getPrecision();
- Precision po = getChildEdgeAt(0)->getDesc().getPrecision();
+ if (!mayiuse(cpu::sse42))
+ return false;
- IE_ASSERT(getParentEdges().size() > 1);
+ // FQ inputs with quantization parameters will be hidden inside the post_op object, so they will not increase the number of inputs
+ size_t addedInputEdgesNum = node->getType() != Quantize ? (node->getParentEdges().size() - 1) : 0;
+ if (getParentEdges().size() + addedInputEdgesNum > MAX_ELTWISE_INPUTS)
+ return false;
- if (!fusedWith.empty()) {
- jit_eltwise_fq();
- } else {
- // Input and output types for eltwise compare operations can be different
- bool is_eltwise_compare_node = (op == EltwiseLayer::Equal || op == EltwiseLayer::Not_equal ||
- op == EltwiseLayer::Greater || op == EltwiseLayer::Greater_equal ||
- op == EltwiseLayer::Less || op == EltwiseLayer::Less_equal);
-
- if (po == Precision::FP32 && pi0 == po && pi1 == po) {
- ref_eltwise<float, float>(0, 1);
- } else if (po == Precision::FP32 && pi0 == po && pi1 == Precision::I8) {
- ref_eltwise<float, int8_t>(0, 1);
- } else if (po == Precision::FP32 && pi1 == po && pi0 == Precision::I8) {
- ref_eltwise<float, int8_t>(1, 0);
- } else if (po == Precision::FP32 && pi0 == po && pi1 == Precision::U8) {
- ref_eltwise<float, uint8_t>(0, 1);
- } else if (po == Precision::FP32 && pi1 == po && pi0 == Precision::U8) {
- ref_eltwise<float, uint8_t>(1, 0);
- } else if (po == Precision::I8 && pi0 == po && pi1 == po) {
- ref_eltwise<int8_t, int8_t>(0, 1);
- } else if (po == Precision::I8 && pi0 == po && pi1 == Precision::U8) {
- ref_eltwise<int8_t, uint8_t>(0, 1);
- } else if (po == Precision::I8 && pi1 == po && pi0 == Precision::U8) {
- ref_eltwise<int8_t, uint8_t>(1, 0);
- } else if (po == Precision::I32 && pi0 == po && pi1 == po) {
- ref_eltwise<int32_t, int32_t>(0, 1);
- } else if (po == Precision::U8 && pi0 == Precision::I32 && pi0 == pi1 && is_eltwise_compare_node) {
- ref_eltwise2<int32_t, int32_t, uint8_t>(0, 1);
- } else if (po == Precision::U8 && pi0 == Precision::FP32 && pi0 == pi1 && is_eltwise_compare_node) {
- ref_eltwise2<float, float, uint8_t>(0, 1);
- } else {
- THROW_IE_EXCEPTION << "Eltwise node with unsupported combination of input and output types";
+ if (node->getType() == Eltwise) {
+ auto eltwiseNode = dynamic_cast<MKLDNNEltwiseNode*>(node.get());
+ if (eltwiseNode->getParentEdgesAtPort(0)[0]->getParent().get() != this) {
+ // The Eltwise jitter doesn't respect the commutative property, so fusing is disabled in case it is applied to a port other than the 0-th one.
+ if (isOneOf(eltwiseNode->getOpType(), {Subtract, Divide, FloorMod, Mod, PowerDynamic, Greater, GreaterEqual, Less, LessEqual})) {
+ return false;
}
- }
- }
-}
-
-bool MKLDNNEltwiseNode::created() const {
- return getType() == Eltwise;
-}
-bool MKLDNNEltwiseNode::canBeInPlace() const {
- size_t inPlaceWithParent = getParentEdges().size();
- for (size_t i = 0; i < inPlaceWithParent; i++) {
- auto parentEdge = getParentEdgeAt(i);
- if (!parentEdge->getParent()->isConstant() &&
- parentEdge->getParent()->getChildEdges().size() == 1) {
- inPlaceWithParent = i;
- break;
- }
- }
- // This is WA for MKLDNN implementation
- if (inPlaceWithParent != 0)
- return false;
- MKLDNNDims dims = getParentEdgeAt(0)->getDims();
- for (size_t cIdx = 0; cIdx < getChildEdges().size(); cIdx++) {
- if (getChildEdgeAt(cIdx)->getDims() != dims) {
- return false;
+ // Limitation: the input precision definition inside the Eltwise node assumes fusing is applied to the 0-th port;
+ // otherwise we need identical precision on all inputs of the fused node
+ for (int i = 1; i < eltwiseNode->getCnnLayer()->insData.size(); i++) {
+ if (eltwiseNode->getCnnLayer()->insData[0].lock()->getPrecision() != eltwiseNode->getCnnLayer()->insData[i].lock()->getPrecision()) {
+ return false;
+ }
+ }
}
+
+ return true;
}
- // Broadcast mode is complex for inplace usage
- // So will disable it
- if (broadcast) return false;
+ if (node->getType() == Quantize) {
+ auto *quantizeNode = dynamic_cast<MKLDNNQuantizeNode *>(node.get());
+ if (quantizeNode == nullptr)
+ THROW_IE_EXCEPTION << "Cannot get quantize layer " << node->getName();
+ return !quantizeNode->isBinarization();
+ }
- return true;
+ return false;
}
+
REG_MKLDNN_PRIM_FOR(MKLDNNEltwiseNode, Eltwise);
#include <mkldnn_node.h>
#include <string>
#include <vector>
-#include <c_types_map.hpp>
#include <memory>
+#include <caseless.hpp>
namespace MKLDNNPlugin {
-struct jit_eltwise_fq_params {
- int src0_step;
- int src1_step;
- int dst_step;
- mkldnn::memory::data_type src0_dt;
- mkldnn::memory::data_type src1_dt;
- mkldnn::memory::data_type dst_dt;
- int src0_data_size;
- int src1_data_size;
- int dst_data_size;
-
- InferenceEngine::EltwiseLayer::eOperation eltwise_op;
+#define MAX_ELTWISE_INPUTS 7
+
+enum EltwiseOpType {
+ Add = 0,
+ Multiply,
+ Subtract,
+ Divide,
+ FloorMod,
+ Mod,
+ Maximum,
+ Minimum,
+ SquaredDifference,
+ PowerDynamic,
+ PowerStatic,
+ MulAdd,
+
+ Equal,
+ NotEqual,
+ Greater,
+ GreaterEqual,
+ Less,
+ LessEqual,
+
+ LogicalAnd,
+ LogicalOr,
+ LogicalXor,
+ LogicalNot,
+
+ Relu,
+ Gelu,
+ Elu,
+ Tanh,
+ Logistic,
+ Square,
+ Abs,
+ Sqrt,
+ Linear,
+ BoundedRelu,
+ SoftRelu,
+ Relu6,
+ Exp,
+ Clamp,
+ Swish,
+ Prelu,
+ Mish,
+ Hswish,
+ Hsigmoid
};
-struct jit_eltwise_fq_call_args {
- const void *src0;
- const void *src1;
+struct jit_eltwise_params {
+ size_t inputs_number;
+ size_t input_size;
+
+ InferenceEngine::Precision src_prc[MAX_ELTWISE_INPUTS];
+ InferenceEngine::Precision dst_prc;
+
+ std::vector<size_t> src_offsets[MAX_ELTWISE_INPUTS];
+ std::vector<size_t> dst_offsets;
+
+ size_t src_size[MAX_ELTWISE_INPUTS];
+ size_t dst_size;
+ size_t oc_size;
+};
+
+struct jit_eltwise_call_args {  // Per-call arguments; the kernel entry point ker_ receives a const pointer to this struct.
+ const void *src_ptr[MAX_ELTWISE_INPUTS];  // One source pointer per input slot.
void *dst;
+
size_t work_amount;
+ size_t oc_off;  // NOTE(review): presumably an offset paired with jit_eltwise_params::oc_size - confirm.
};
-struct jit_uni_eltwise_fq_kernel {
- void (*ker_)(const jit_eltwise_fq_call_args *);
+class MKLDNNEltwiseNode;
- void operator()(const jit_eltwise_fq_call_args *args) {
+struct jit_uni_eltwise_kernel {  // Callable wrapper around the function pointer ker_.
+ void (*ker_)(const jit_eltwise_call_args *);  // Null after construction and never assigned here; presumably set by a derived class.
+
+ void operator()(const jit_eltwise_call_args *args) {  // Asserts that ker_ is set, then forwards args to it.
assert(ker_);
ker_(args);
}
- explicit jit_uni_eltwise_fq_kernel(jit_eltwise_fq_params jep, const mkldnn_primitive_attr &attr) : ker_(nullptr), jep_(jep), attr_(attr) {}
- virtual ~jit_uni_eltwise_fq_kernel() {}
+ explicit jit_uni_eltwise_kernel(jit_eltwise_params jep, MKLDNNEltwiseNode& node) : ker_(nullptr), jep_(jep), eltwiseNode(node) {}  // Copies jep and binds node by reference.
+ virtual ~jit_uni_eltwise_kernel() {}
- jit_eltwise_fq_params jep_;
- const mkldnn_primitive_attr &attr_;
+ jit_eltwise_params jep_;
+ MKLDNNEltwiseNode& eltwiseNode;  // Reference member: the node must outlive this kernel object.
};
class MKLDNNEltwiseNode : public MKLDNNNode {
void getSupportedDescriptors() override;
void initSupportedPrimitiveDescriptors() override;
+ void selectOptimalPrimitiveDescriptor() override;
void createPrimitive() override;
void execute(mkldnn::stream strm) override;
bool created() const override;
bool canBeInPlace() const override;
bool isSum();
- bool isUnitScales();
bool isWithBroadcast();
- void initOptimalPrimitiveDescriptor() override;
+
+ bool canFuse(const MKLDNNNodePtr& node) const;
+
+ size_t getOpInputsNum() const;
+ EltwiseOpType getOpType() const { return eltwiseOp; }
+ mkldnn::algorithm getAlgorithm() const { return eltwiseAlgorithm; }
+
+ float getAlpha() const { return alpha; }
+ float getBeta() const { return beta; }
+
+ void appendPostOps(mkldnn::post_ops& ops) override;
private:
- InferenceEngine::EltwiseLayer::eOperation op;
- std::vector<float> sum_scales;
- bool broadcast = false;
- int batch_dim = 5;
- mkldnn::primitive_attr attr;
-
- std::shared_ptr<jit_uni_eltwise_fq_kernel> eltiwse_fq_kernel;
- jit_eltwise_fq_params jep;
-
- void jit_eltwise_fq();
- void setPostOps(mkldnn::primitive_attr &attr, bool initWeights);
-
- template <typename T0, typename T1> void ref_eltwise(int in0, int in1);
- template <typename T0, typename T1, typename T2> void ref_eltwise2(int in0, int in1);
- void dims_calc(int *dims, const MKLDNNDims &edge_dims, bool channels_first);
- void offset_out_calc(int *offset, int *dims);
- void offset_in_calc(int *offset, int *dims_in, int *dims_out);
-
- template <typename T0, typename T1> void eltwise_add(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
- template <typename T0, typename T1> void eltwise_prod(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
- template <typename T0, typename T1> void eltwise_max(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
- template <typename T0, typename T1> void eltwise_sub(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
- template <typename T0, typename T1> void eltwise_min(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
- template <typename T0, typename T1> void eltwise_div(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
- template <typename T0, typename T1> void eltwise_squared_diff(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
- template <typename T0, typename T1> void eltwise_floor_mod(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
- template <typename T0, typename T1> void eltwise_pow(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
- template <typename T0, typename T1> void eltwise_logical_and(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
- template <typename T0, typename T1> void eltwise_logical_or(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
- template <typename T0, typename T1> void eltwise_logical_xor(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
-
- template <typename T0, typename T1, typename T2> void eltwise_equal(const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, size_t dst_data_size);
- template <typename T0, typename T1, typename T2> void eltwise_not_equal(const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, size_t dst_data_size);
- template <typename T0, typename T1, typename T2> void eltwise_less(const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, size_t dst_data_size);
- template <typename T0, typename T1, typename T2> void eltwise_less_equal(const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, size_t dst_data_size);
- template <typename T0, typename T1, typename T2> void eltwise_greater(const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, size_t dst_data_size);
- template <typename T0, typename T1, typename T2> void eltwise_greater_equal(const T0 *src0_ptr, const T1 *src1_ptr, T2 *dst_ptr, size_t dst_data_size);
+ void init() override;
+
+ EltwiseOpType eltwiseOp = Add;
+ mkldnn::algorithm eltwiseAlgorithm = mkldnn::algorithm_undef;
+
+ std::shared_ptr<jit_uni_eltwise_kernel> eltwise_kernel = nullptr;
+ jit_eltwise_params jep = {};
+
+ int optimalTensorRank = 6;
+ bool canUseOptimizedImpl = false;
+ bool isDynBatchEnabled = false;
+ size_t batchDimIdx = 0;
+ size_t tensorRank = 0;
+ size_t fullWorkAmount = 0;
+ size_t schedulerWorkAmount = 0;
+ std::vector<std::vector<size_t>> dims_in = {};
+ std::vector<std::vector<size_t>> offsets_in = {};
+ std::vector<size_t> dims_out = {};
+ std::vector<size_t> offsets_out = {};
+ std::vector<ptrdiff_t> start_offset_in = {};
+ ptrdiff_t start_offset_out = 0;
+ std::vector<size_t> offsets_oc = {};
+
+ float alpha = 0;
+ float beta = 0;
+ float gamma = 0;
+
+ std::vector<float> scales = {};
+ std::vector<float> shifts = {};
+
+ inline void executeOptimized6D(const std::vector<const uint8_t *>& src_ptrs, uint8_t *dst_ptr);
+ inline void executeOptimizedGeneric(const std::vector<const uint8_t *>& src_ptrs, uint8_t *dst_ptr);
+ inline void executeReference(const std::vector<const uint8_t *>& src_ptrs, uint8_t *dst_ptr);
+
+ void offset_out_calc(std::vector<size_t>& offset, std::vector<size_t>& dims);
+ void offset_in_calc(std::vector<size_t>& offset, std::vector<size_t>& dims_in, std::vector<size_t>& dims_out);
+
+ static InferenceEngine::details::caseless_map<std::string,
+ std::function<void(InferenceEngine::GenericLayer*, EltwiseOpType&, mkldnn::algorithm&, float&, float&)>> initializers;
};
} // namespace MKLDNNPlugin
//
#include "mkldnn_fullyconnected_node.h"
-#include "mkldnn_activation_node.h"
-#include "mkldnn_depthwise_node.h"
+#include "mkldnn_eltwise_node.h"
#include "mkldnn_quantize_node.h"
#include "desc_iterator.hpp"
#include <legacy/ie_layers.h>
continue;
}
- auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode *>(node.get());
- if (depthwiseNode) {
+ auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
+ if (eltwiseNode && (eltwiseNode->getOpType() == MulAdd || eltwiseNode->getOpType() == Prelu)) {
if (initWeights) {
- auto* depthwiseLayer = reinterpret_cast<WeightableLayer*>(depthwiseNode->getCnnLayer().get());
+ auto* depthwiseLayer = reinterpret_cast<WeightableLayer*>(eltwiseNode->getCnnLayer().get());
int ndims = getParentEdgeAt(0)->getDims().ndims();
MKLDNNDims depthwiseDims({static_cast<ptrdiff_t>(rnd_up(ndims == 3 ? getChildEdgeAt(0)->getDims()[2] : getChildEdgeAt(0)->getDims()[1], 16))});
PostOpsIntBlobMemory[blob_idx]->FillZero();
// In case ndims == 3 graph optimizer allows fusing only if all weights values are the same
- if (depthwiseNode->isBroadcast() || ndims == 3) {
+ if (depthwiseLayer->blobs["weights"]->size() == 1 || ndims == 3) {
float broadcastValue = static_cast<float *>(depthwiseLayer->_weights->buffer())[0];
for (int i = 0; i < PostOpsIntBlobMemory[blob_idx]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[i] = broadcastValue;
MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
}
- if (depthwiseNode->getAlgorithm() == depthwise_scale_shift) {
+ if (eltwiseNode->getAlgorithm() == depthwise_scale_shift) {
PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
PostOpsIntBlobMemory[blob_idx + 1]->Create(depthwiseDims, memory::data_type::f32, memory::format::x);
PostOpsIntBlobMemory[blob_idx + 1]->FillZero();
// In case ndims == 3 graph optimizer allows fusing only if all biases values are the same
- if (depthwiseNode->isBroadcast() || ndims == 3) {
+ if (depthwiseLayer->blobs["biases"]->size() == 1 || ndims == 3) {
float broadcastValue = static_cast<float *>(depthwiseLayer->_biases->buffer())[0];
for (int i = 0; i < PostOpsIntBlobMemory[blob_idx + 1]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[i] = broadcastValue;
MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
}
- ops.append_depthwise(depthwiseNode->getAlgorithm(),
+ ops.append_depthwise(eltwiseNode->getAlgorithm(),
(const float *) PostOpsIntBlobMemory[blob_idx]->GetData(),
(const float *) PostOpsIntBlobMemory[blob_idx + 1]->GetData());
blob_idx += 2;
} else {
- ops.append_depthwise(depthwiseNode->getAlgorithm(),
+ ops.append_depthwise(eltwiseNode->getAlgorithm(),
(const float *) PostOpsIntBlobMemory[blob_idx]->GetData(),
nullptr);
blob_idx += 1;
}
} else {
- ops.append_depthwise(depthwiseNode->getAlgorithm(),
+ ops.append_depthwise(eltwiseNode->getAlgorithm(),
nullptr,
nullptr);
}
continue;
}
- auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get());
- if (activationNode) {
- ops.append_eltwise(1.0, activationNode->getAlgorithm(), activationNode->getAlpha(), activationNode->getBeta());
-
- continue;
+ if (eltwiseNode) {
+ eltwiseNode->appendPostOps(ops);
}
}
#include "mkldnn_interpolate_node.h"
#include "desc_iterator.hpp"
#include "mkldnn_quantize_node.h"
-#include "mkldnn_depthwise_node.h"
-#include "mkldnn_activation_node.h"
#include <legacy/ie_layers.h>
+#include "mkldnn_eltwise_node.h"
#include <mkldnn.hpp>
#include <string>
#include <vector>
continue;
}
- auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode *>(node.get());
- if (depthwiseNode) {
- if (initWeights) {
- auto* depthwiseLayer = reinterpret_cast<WeightableLayer*>(depthwiseNode->getCnnLayer().get());
- MKLDNNDims depthwiseDims({static_cast<ptrdiff_t>(rnd_up(getChildEdgeAt(0)->getDims()[1], 16))});
-
- PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
- PostOpsIntBlobMemory[blob_idx]->Create(depthwiseDims, memory::data_type::f32, memory::format::x);
-
- PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::x,
- depthwiseLayer->_weights->buffer(),
- depthwiseLayer->_weights->size() *
- MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
-
- if (depthwiseNode->isBroadcast()) {
- float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[0];
- for (int i = 1; i < PostOpsIntBlobMemory[blob_idx]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
- static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[i] = broadcastValue;
- }
- }
-
- if (depthwiseNode->getAlgorithm() == depthwise_scale_shift) {
- PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
- PostOpsIntBlobMemory[blob_idx + 1]->Create(depthwiseDims, memory::data_type::f32,
- memory::format::x);
- PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::x,
- depthwiseLayer->_biases->buffer(),
- depthwiseLayer->_biases->size() *
- MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
-
- if (depthwiseNode->isBroadcast()) {
- float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[0];
- for (int i = 1; i < PostOpsIntBlobMemory[blob_idx + 1]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
- static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[i] = broadcastValue;
- }
- }
-
- ops.append_depthwise(depthwiseNode->getAlgorithm(),
- (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(),
- (const float *) PostOpsIntBlobMemory[blob_idx + 1]->GetData());
-
- blob_idx += 2;
- }
- } else {
- ops.append_depthwise(depthwiseNode->getAlgorithm(),
- nullptr,
- nullptr);
- }
-
- continue;
- }
-
- auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get());
- if (activationNode) {
- ops.append_eltwise(1.0, activationNode->getAlgorithm(), activationNode->getAlpha(), activationNode->getBeta());
-
+ auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
+ if (eltwiseNode) {
+ eltwiseNode->appendPostOps(ops);
continue;
}
}
bool MKLDNNInterpolateNode::canFuse(const MKLDNNNodePtr& node) const {
- auto isOneOf = [](mkldnn::algorithm alg, std::vector<mkldnn::algorithm> algs) {
+ auto isOneOf = [&](EltwiseOpType alg, std::vector<EltwiseOpType> algs) {
for (auto a : algs) {
if (alg == a) {
return true;
if (node->getType() == Quantize) {
auto* quantizeNode = dynamic_cast<MKLDNNQuantizeNode*>(node.get());
if (quantizeNode == nullptr)
- THROW_IE_EXCEPTION << "Cannot get quantize layer " << node->getName();
+ THROW_IE_EXCEPTION << "Cannot get quantize node " << node->getName();
return !quantizeNode->isBinarization();
- } else if (node->getType() == Depthwise) {
- auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode*>(node.get());
- if (depthwiseNode == nullptr)
- THROW_IE_EXCEPTION << "Cannot get depthwise layer " << node->getName();
- return ((depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_scale_shift && depthwiseNode->isWithBiases()) ||
- (depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_prelu));
- } else if (node->getType() == Activation) {
- auto* activationNode = dynamic_cast<MKLDNNActivationNode*>(node.get());
- if (activationNode == nullptr)
- THROW_IE_EXCEPTION << "Cannot get activation layer " << node->getName();
- return isOneOf(activationNode->getAlgorithm(), {eltwise_relu, eltwise_gelu, eltwise_elu, eltwise_logistic,
- eltwise_bounded_relu, eltwise_clamp, eltwise_tanh, eltwise_swish, eltwise_hswish, eltwise_mish, eltwise_hsigmoid,
- eltwise_linear, eltwise_abs, eltwise_square, eltwise_sqrt});
+ } else if (node->getType() == Eltwise) {
+ auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode*>(node.get());
+ if (eltwiseNode == nullptr)
+ THROW_IE_EXCEPTION << "Cannot get eltwise node " << node->getName();
+ return isOneOf(eltwiseNode->getOpType(), {MulAdd, Prelu, Relu, Gelu, Elu, Logistic, BoundedRelu, Clamp,
+ Tanh, Swish, Hswish, Mish, Hsigmoid, Linear, Abs, Square, Sqrt});
}
+
return false;
}
#include "mkldnn_mvn_node.h"
#include "desc_iterator.hpp"
#include "mkldnn_quantize_node.h"
-#include "mkldnn_depthwise_node.h"
-#include "mkldnn_activation_node.h"
#include <legacy/ie_layers.h>
+#include "mkldnn_eltwise_node.h"
#include <mkldnn.hpp>
#include <string>
#include <vector>
continue;
}
- auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode *>(node.get());
- if (depthwiseNode) {
- if (initWeights) {
- auto* depthwiseLayer = reinterpret_cast<WeightableLayer*>(depthwiseNode->getCnnLayer().get());
- MKLDNNDims depthwiseDims({static_cast<ptrdiff_t>(rnd_up(getChildEdgeAt(0)->getDims()[1], 16))});
-
- PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
- PostOpsIntBlobMemory[blob_idx]->Create(depthwiseDims, memory::data_type::f32, memory::format::x);
- PostOpsIntBlobMemory[blob_idx]->FillZero();
-
- PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::x,
- depthwiseLayer->_weights->buffer(),
- depthwiseLayer->_weights->size() *
- MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
-
- if (depthwiseNode->isBroadcast()) {
- float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[0];
- for (int i = 1; i < PostOpsIntBlobMemory[blob_idx]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
- static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[i] = broadcastValue;
- }
- }
-
- if (depthwiseNode->getAlgorithm() == depthwise_scale_shift) {
- PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
- PostOpsIntBlobMemory[blob_idx + 1]->Create(depthwiseDims, memory::data_type::f32,
- memory::format::x);
- PostOpsIntBlobMemory[blob_idx + 1]->FillZero();
- PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::x,
- depthwiseLayer->_biases->buffer(),
- depthwiseLayer->_biases->size() *
- MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
-
- if (depthwiseNode->isBroadcast()) {
- float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[0];
- for (int i = 1; i < PostOpsIntBlobMemory[blob_idx + 1]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
- static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[i] = broadcastValue;
- }
- }
-
- ops.append_depthwise(depthwiseNode->getAlgorithm(),
- (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(),
- (const float *) PostOpsIntBlobMemory[blob_idx + 1]->GetData());
-
- blob_idx += 2;
- }
- } else {
- ops.append_depthwise(depthwiseNode->getAlgorithm(),
- nullptr,
- nullptr);
- }
-
- continue;
- }
-
- auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get());
- if (activationNode) {
- ops.append_eltwise(1.0, activationNode->getAlgorithm(), activationNode->getAlpha(), activationNode->getBeta());
-
+ auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
+ if (eltwiseNode) {
+ eltwiseNode->appendPostOps(ops);
continue;
}
//
#include "mkldnn_quantize_node.h"
-#include "mkldnn_depthwise_node.h"
-#include "mkldnn_activation_node.h"
+#include "mkldnn_eltwise_node.h"
#include <mkldnn_extension_utils.h>
#include <legacy/ie_layers_internal.hpp>
#include "ie_parallel.hpp"
continue;
}
- auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode *>(node.get());
- if (depthwiseNode) {
- if (initWeights) {
- auto* depthwiseLayer = reinterpret_cast<WeightableLayer*>(depthwiseNode->getCnnLayer().get());
- MKLDNNDims depthwiseDims({static_cast<ptrdiff_t>(rnd_up(getParentEdgeAt(0)->getDims()[1], 16))});
-
- PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
- PostOpsIntBlobMemory[blob_idx]->Create(depthwiseDims, memory::data_type::f32, memory::format::x);
- PostOpsIntBlobMemory[blob_idx]->FillZero();
-
- PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::x,
- depthwiseLayer->_weights->buffer(),
- depthwiseLayer->_weights->size() *
- MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
-
- if (depthwiseNode->isBroadcast()) {
- float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[0];
- for (int i = 1; i < PostOpsIntBlobMemory[blob_idx]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
- static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[i] = broadcastValue;
- }
- }
-
- if (depthwiseNode->getAlgorithm() == depthwise_scale_shift) {
- PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
- PostOpsIntBlobMemory[blob_idx + 1]->Create(depthwiseDims, memory::data_type::f32,
- memory::format::x);
- PostOpsIntBlobMemory[blob_idx + 1]->FillZero();
- PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::x,
- depthwiseLayer->_biases->buffer(),
- depthwiseLayer->_biases->size() *
- MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
-
- if (depthwiseNode->isBroadcast()) {
- float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[0];
- for (int i = 1; i < PostOpsIntBlobMemory[blob_idx + 1]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
- static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[i] = broadcastValue;
- }
- }
-
- ops.append_depthwise(depthwiseNode->getAlgorithm(),
- (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(),
- (const float *) PostOpsIntBlobMemory[blob_idx + 1]->GetData());
-
- blob_idx += 2;
- } else {
- ops.append_depthwise(depthwiseNode->getAlgorithm(),
- (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(),
- nullptr);
-
- blob_idx += 1;
- }
- } else {
- ops.append_depthwise(depthwiseNode->getAlgorithm(),
- nullptr,
- nullptr);
- }
-
- continue;
- }
-
- auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get());
- if (activationNode) {
- ops.append_eltwise(1.0, activationNode->getAlgorithm(), activationNode->getAlpha(), activationNode->getBeta());
-
+ auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
+ if (eltwiseNode) {
+ eltwiseNode->appendPostOps(ops);
continue;
}
+++ /dev/null
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#include "mkldnn_power_node.h"
-#include <legacy/ie_layers.h>
-#include <string>
-#include <cmath>
-#include <mkldnn_types.h>
-#include <mkldnn_extension_utils.h>
-#include <limits>
-#include "ie_parallel.hpp"
-
-using namespace mkldnn;
-using namespace MKLDNNPlugin;
-using namespace InferenceEngine;
-
-MKLDNNPowerNode::MKLDNNPowerNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache)
- : MKLDNNNode(layer, eng, cache), scale(1.0f), shift(1.0f), power(1.0f) {}
-
-void MKLDNNPowerNode::getSupportedDescriptors() {
- auto * powerLayer = dynamic_cast<PowerLayer*>(getCnnLayer().get());
-
- if (powerLayer == nullptr)
- THROW_IE_EXCEPTION << "Cannot convert power layer.";
- scale = powerLayer->scale;
- power = powerLayer->power;
- shift = powerLayer->offset;
-
- if (getParentEdges().size() != 1)
- THROW_IE_EXCEPTION << "Incorrect number of input edges for layer " << getName();
- if (getChildEdges().empty())
- THROW_IE_EXCEPTION << "Incorrect number of output edges for layer " << getName();
-}
-
-void MKLDNNPowerNode::initSupportedPrimitiveDescriptors() {
- if (!supportedPrimitiveDescriptors.empty())
- return;
-
- InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision();
- if (precision != InferenceEngine::Precision::FP32)
- precision = InferenceEngine::Precision::FP32;
- auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
- precision = getCnnLayer()->outData[0]->getPrecision();
- if (precision != InferenceEngine::Precision::FP32)
- precision = InferenceEngine::Precision::FP32;
- auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
-
- InferenceEngine::LayerConfig config;
- config.dynBatchSupport = true;
- config.inConfs.resize(1);
- config.outConfs.resize(1);
- config.inConfs[0].inPlace = -1;
- config.inConfs[0].constant = false;
- config.outConfs[0].inPlace = -1;
- config.outConfs[0].constant = false;
- for (auto format : getAvailableFormatsForDims(getParentEdgeAt(0)->getDims())) {
- config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, format);
- config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, format);
- if (format != memory::any) {
- config.inConfs[0].desc = InferenceEngine::TensorDesc(config.inConfs[0].desc.getPrecision(),
- config.inConfs[0].desc.getDims(), {
- config.inConfs[0].desc.getBlockingDesc().getBlockDims(),
- config.inConfs[0].desc.getBlockingDesc().getOrder(),
- (std::numeric_limits<size_t>::max)()
- });
- config.outConfs[0].desc = InferenceEngine::TensorDesc(config.outConfs[0].desc.getPrecision(),
- config.outConfs[0].desc.getDims(), {
- config.outConfs[0].desc.getBlockingDesc().getBlockDims(),
- config.outConfs[0].desc.getBlockingDesc().getOrder(),
- (std::numeric_limits<size_t>::max)()
- });
- }
- supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, format);
- }
-}
-
-void MKLDNNPowerNode::createPrimitive() {
- auto& dstMemPtr = getChildEdgeAt(0)->getMemoryPtr();
- auto& srcMemPtr = getParentEdgeAt(0)->getMemoryPtr();
- if (!dstMemPtr || !dstMemPtr->GetPrimitivePtr())
- THROW_IE_EXCEPTION << "Destination memory didn't allocate.";
- if (!srcMemPtr || !srcMemPtr->GetPrimitivePtr())
- THROW_IE_EXCEPTION << "Input memory didn't allocate.";
- if (getSelectedPrimitiveDescriptor() == nullptr)
- THROW_IE_EXCEPTION << "Preferable primitive descriptor is not set.";
-}
-
-void MKLDNNPowerNode::execute(mkldnn::stream strm) {
- auto& srcMemory = getParentEdgeAt(0)->getMemory();
- auto& dstMemory = getChildEdgeAt(0)->getMemory();
- const size_t data_size = srcMemory.GetSize() / sizeof(float) / srcMemory.GetDims()[0] * batchToProcess();
-
- const auto *src_ptr = reinterpret_cast<const float*>(srcMemory.GetData()) +
- srcMemory.GetDescriptor().data.layout_desc.blocking.offset_padding;
- float *dst_ptr = reinterpret_cast<float*>(dstMemory.GetData()) +
- dstMemory.GetDescriptor().data.layout_desc.blocking.offset_padding;
-
- if (power == -1.f) {
- parallel_for(data_size, [&](size_t i) {
- float val = src_ptr[i] * scale + shift;
- dst_ptr[i] = 1 / val;
- });
- } else if (power == 0.5f) {
- parallel_for(data_size, [&](size_t i) {
- float val = src_ptr[i] * scale + shift;
- dst_ptr[i] = sqrtf(val);
- });
- } else if (power == 1.0f) {
- parallel_for(data_size, [&](size_t i) {
- dst_ptr[i] = src_ptr[i] * scale + shift;
- });
- } else if (power == 2.0f) {
- parallel_for(data_size, [&](size_t i) {
- float val = src_ptr[i] * scale + shift;
- dst_ptr[i] = val * val;
- });
- } else if (power == 3.0f) {
- parallel_for(data_size, [&](size_t i) {
- float val = src_ptr[i] * scale + shift;
- dst_ptr[i] = val * val * val;
- });
- } else {
- parallel_for(data_size, [&](size_t i) {
- dst_ptr[i] = pow(src_ptr[i] * scale + shift, power);
- });
- }
-}
-
-bool MKLDNNPowerNode::created() const {
- return getType() == Power;
-}
-REG_MKLDNN_PRIM_FOR(MKLDNNPowerNode, Power);
+++ /dev/null
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma once
-
-#include <ie_common.h>
-#include <mkldnn_node.h>
-#include <string>
-
-namespace MKLDNNPlugin {
-
-class MKLDNNPowerNode : public MKLDNNNode {
-public:
- MKLDNNPowerNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
- ~MKLDNNPowerNode() override = default;
-
- void getSupportedDescriptors() override;
- void initSupportedPrimitiveDescriptors() override;
- void createPrimitive() override;
- void execute(mkldnn::stream strm) override;
- bool created() const override;
-
-private:
- float scale;
- float shift;
- float power;
-};
-
-} // namespace MKLDNNPlugin
-
THROW_IE_EXCEPTION << "Quantize layer " << getName() << " has unsupported number of parent edges at port " << i;
}
- if (getParentEdgesAtPort(0)[0]->getDims().ndims() < 1ul || getParentEdgesAtPort(0)[0]->getDims().ndims() > 5ul) {
- THROW_IE_EXCEPTION << "Unsupported number of dimensions for input at edge 0 in Quantize layer " << getName();
- }
-
auto initAxisIdx = [&](size_t edgeIdx) {
auto edge = getParentEdgesAtPort(edgeIdx)[0];
}
void MKLDNNQuantizeNode::getSupportedDescriptors() {
+ if (getParentEdgesAtPort(0)[0]->getDims().ndims() < 1ul || getParentEdgesAtPort(0)[0]->getDims().ndims() > 5ul) {
+ THROW_IE_EXCEPTION << "Unsupported number of dimensions for input at edge 0 in Quantize layer " << getName();
+ }
+
mkldnn::memory::data_type idt = MKLDNNExtensionUtils::IEPrecisionToDataType(getInputPrecision());
mkldnn::memory::data_type wdt = MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::Precision::FP32);
mkldnn::memory::data_type ddt = MKLDNNExtensionUtils::IEPrecisionToDataType(getOutputPrecision());
#include "mkldnn_reduce_node.h"
#include "desc_iterator.hpp"
#include "mkldnn_quantize_node.h"
-#include "mkldnn_depthwise_node.h"
-#include "mkldnn_activation_node.h"
#include <legacy/ie_layers.h>
#include <mkldnn.hpp>
#include <string>
#include <vector>
+#include <set>
#include <mkldnn_types.h>
#include <mkldnn_extension_utils.h>
#include "ie_parallel.hpp"
#include "mkldnn_resample_node.h"
#include "desc_iterator.hpp"
#include "mkldnn_quantize_node.h"
-#include "mkldnn_depthwise_node.h"
-#include "mkldnn_activation_node.h"
#include <legacy/ie_layers.h>
+#include "mkldnn_eltwise_node.h"
#include <mkldnn.hpp>
#include <string>
#include <vector>
continue;
}
- auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode *>(node.get());
- if (depthwiseNode) {
- if (initWeights) {
- auto* depthwiseLayer = reinterpret_cast<WeightableLayer*>(depthwiseNode->getCnnLayer().get());
- MKLDNNDims depthwiseDims({static_cast<ptrdiff_t>(rnd_up(getChildEdgeAt(0)->getDims()[1], 16))});
-
- PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
- PostOpsIntBlobMemory[blob_idx]->Create(depthwiseDims, memory::data_type::f32, memory::format::x);
- PostOpsIntBlobMemory[blob_idx]->FillZero();
-
- PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::x,
- depthwiseLayer->_weights->buffer(),
- depthwiseLayer->_weights->size() *
- MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
-
- if (depthwiseNode->isBroadcast()) {
- float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[0];
- for (int i = 1; i < PostOpsIntBlobMemory[blob_idx]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
- static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[i] = broadcastValue;
- }
- }
-
- if (depthwiseNode->getAlgorithm() == depthwise_scale_shift) {
- PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
- PostOpsIntBlobMemory[blob_idx + 1]->Create(depthwiseDims, memory::data_type::f32,
- memory::format::x);
- PostOpsIntBlobMemory[blob_idx + 1]->FillZero();
- PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::x,
- depthwiseLayer->_biases->buffer(),
- depthwiseLayer->_biases->size() *
- MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
-
- if (depthwiseNode->isBroadcast()) {
- float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[0];
- for (int i = 1; i < PostOpsIntBlobMemory[blob_idx + 1]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
- static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[i] = broadcastValue;
- }
- }
-
- ops.append_depthwise(depthwiseNode->getAlgorithm(),
- (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(),
- (const float *) PostOpsIntBlobMemory[blob_idx + 1]->GetData());
-
- blob_idx += 2;
- }
- } else {
- ops.append_depthwise(depthwiseNode->getAlgorithm(),
- nullptr,
- nullptr);
- }
-
- continue;
- }
-
- auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get());
- if (activationNode) {
- ops.append_eltwise(1.0, activationNode->getAlgorithm(), activationNode->getAlpha(), activationNode->getBeta());
-
+ auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
+ if (eltwiseNode) {
+ eltwiseNode->appendPostOps(ops);
continue;
}
#include "mkldnn_scatter_update_node.h"
#include "desc_iterator.hpp"
#include "mkldnn_quantize_node.h"
-#include "mkldnn_depthwise_node.h"
-#include "mkldnn_activation_node.h"
#include <legacy/ie_layers.h>
#include <mkldnn.hpp>
#include <string>
};
std::vector<ngraph::helpers::EltwiseTypes> eltwiseOpTypes = {
+ ngraph::helpers::EltwiseTypes::ADD,
ngraph::helpers::EltwiseTypes::MULTIPLY,
ngraph::helpers::EltwiseTypes::SUBTRACT,
- ngraph::helpers::EltwiseTypes::ADD
+ ngraph::helpers::EltwiseTypes::DIVIDE,
+ ngraph::helpers::EltwiseTypes::FLOOR_MOD,
+ ngraph::helpers::EltwiseTypes::SQUARED_DIFF,
+ ngraph::helpers::EltwiseTypes::POWER,
+ ngraph::helpers::EltwiseTypes::MOD
};
std::map<std::string, std::string> additional_config = {};
R"(.*(QuantGroupConv3D).*)",
// TODO: Issue 31845
R"(.*(FakeQuantizeLayerTest).*)",
- R"(.*(EltwiseLayerTest).*IS=\(.*\..*\..*\..*\..*\).*secondaryInputType=PARAMETER.*opType=SCALAR.*)",
// TODO: failed to downgrade to opset v0 in interpreter backend
R"(.*Gather.*axis=-1.*)",
// TODO: Issue 33151
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <single_layer_tests/eltwise.hpp>
+#include <ngraph_functions/builders.hpp>
+#include "test_utils/cpu_test_utils.hpp"
+
+using namespace InferenceEngine;
+using namespace CPUTestUtils;
+
+namespace CPULayerTestsDefinitions {
+
+// Full parameter set of the CPU Eltwise test: the generic Eltwise layer-test parameters
+// combined with the CPU-specific parameters.
+using EltwiseLayerCPUTestParamsSet = std::tuple<
+        LayerTestsDefinitions::EltwiseTestParams,
+        CPUSpecificParams>;
+
+// Value-parameterized fixture that builds an ngraph function containing a single Eltwise node
+// and records the memory formats and implementation type expected for it.
+class EltwiseLayerCPUTest : public testing::WithParamInterface<EltwiseLayerCPUTestParamsSet>,
+                            virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase {
+public:
+    // Test name = generic Eltwise test name followed by the CPU-specific suffix.
+    static std::string getTestCaseName(testing::TestParamInfo<EltwiseLayerCPUTestParamsSet> obj) {
+        LayerTestsDefinitions::EltwiseTestParams basicParamsSet;
+        CPUSpecificParams cpuParams;
+        std::tie(basicParamsSet, cpuParams) = obj.param;
+
+        std::ostringstream result;
+        result << LayerTestsDefinitions::EltwiseLayerTest::getTestCaseName(testing::TestParamInfo<LayerTestsDefinitions::EltwiseTestParams>(
+                basicParamsSet, 0));
+        result << CPUTestsBase::getTestCaseName(cpuParams);
+
+        return result.str();
+    }
+
+protected:
+    // Unpacks the test parameters and builds `function`: Eltwise(input[0], secondaryInput).
+    void SetUp() override {
+        LayerTestsDefinitions::EltwiseTestParams basicParamsSet;
+        CPUSpecificParams cpuParams;
+        std::tie(basicParamsSet, cpuParams) = this->GetParam();
+
+        std::vector<std::vector<size_t>> inputShapes;
+        InferenceEngine::Precision netPrecision;
+        ngraph::helpers::InputLayerType secondaryInputType;
+        CommonTestUtils::OpType opType;
+        ngraph::helpers::EltwiseTypes eltwiseType;
+        std::map<std::string, std::string> additional_config;
+        std::tie(inputShapes, eltwiseType, secondaryInputType, opType, netPrecision, inPrc, outPrc, inLayout, targetDevice, additional_config) = basicParamsSet;
+        std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
+        auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
+
+        // The selectedType taken from cpuParams is overridden below: the expected implementation
+        // type is derived from the best ISA reported for the host.
+        std::string isaType;
+        if (with_cpu_x86_avx512f()) {
+            isaType = "jit_avx512";
+        } else if (with_cpu_x86_avx2()) {
+            isaType = "jit_avx2";
+        } else if (with_cpu_x86_sse42()) {
+            isaType = "jit_sse42";
+        } else {
+            isaType = "ref";
+        }
+        selectedType = isaType + "_" + "FP32";
+
+        // One shape means both inputs share it; two shapes are taken as first/second input.
+        std::vector<size_t> inputShape1, inputShape2;
+        if (inputShapes.size() == 1) {
+            inputShape1 = inputShape2 = inputShapes.front();
+        } else if (inputShapes.size() == 2) {
+            inputShape1 = inputShapes.front();
+            inputShape2 = inputShapes.back();
+        } else {
+            THROW_IE_EXCEPTION << "Incorrect number of input shapes";
+        }
+
+        configuration.insert(additional_config.begin(), additional_config.end());
+        auto input = ngraph::builder::makeParams(ngPrc, {inputShape1});
+
+        std::vector<size_t> shape_input_secondary;
+        switch (opType) {
+            case CommonTestUtils::OpType::SCALAR: {
+                shape_input_secondary = std::vector<size_t>({1});
+                break;
+            }
+            case CommonTestUtils::OpType::VECTOR:
+                shape_input_secondary = inputShape2;
+                break;
+            default:
+                FAIL() << "Unsupported Secondary operation type";
+        }
+
+        std::shared_ptr<ngraph::Node> secondaryInput;
+        if (eltwiseType == ngraph::helpers::EltwiseTypes::DIVIDE ||
+            eltwiseType == ngraph::helpers::EltwiseTypes::FLOOR_MOD ||
+            eltwiseType == ngraph::helpers::EltwiseTypes::MOD) {
+            // For division-like operations the second operand is always a constant (secondaryInputType
+            // is ignored) whose zero elements are replaced with 1.
+            std::vector<float> data =
+                NGraphFunctions::Utils::generateVector<ngraph::element::Type_t::f32>(ngraph::shape_size(shape_input_secondary));
+            for (float &value : data) {
+                if (value == 0) {
+                    value = 1;
+                }
+            }
+            secondaryInput = ngraph::builder::makeConstant(ngPrc, shape_input_secondary, data);
+        } else {
+            secondaryInput = ngraph::builder::makeInputLayer(ngPrc, secondaryInputType, shape_input_secondary);
+            if (secondaryInputType == ngraph::helpers::InputLayerType::PARAMETER) {
+                // A Parameter secondary input must also be registered as a function parameter.
+                input.push_back(std::dynamic_pointer_cast<ngraph::opset3::Parameter>(secondaryInput));
+            }
+        }
+
+        auto eltwise = ngraph::builder::makeEltwise(input[0], secondaryInput, eltwiseType);
+        // Attach the requested input/output formats and priority to the node's runtime info.
+        eltwise->get_rt_info() = CPUTestsBase::setCPUInfo(inFmts, outFmts, priority);
+        function = std::make_shared<ngraph::Function>(eltwise, input, "Eltwise");
+    }
+};
+
+// Runs the test (unless it is disabled) and then checks the "Eltwise" node of the executable
+// network against the expected input/output formats and selected implementation type.
+TEST_P(EltwiseLayerCPUTest, CompareWithRefs) {
+ SKIP_IF_CURRENT_TEST_IS_DISABLED()
+
+ Run();
+ CheckCPUImpl(executableNetwork, "Eltwise", inFmts, outFmts, selectedType);
+}
+
+namespace {
+
+// Kinds of secondary input combined into the same-layout 4D/5D suites below.
+std::vector<ngraph::helpers::InputLayerType> secondaryInputTypes = {
+ ngraph::helpers::InputLayerType::CONSTANT,
+ ngraph::helpers::InputLayerType::PARAMETER,
+};
+
+// Only the VECTOR form of the secondary operand is exercised by these suites.
+std::vector<CommonTestUtils::OpType> opTypes = {
+ CommonTestUtils::OpType::VECTOR,
+};
+
+// Eltwise operations covered by every suite in this file.
+std::vector<ngraph::helpers::EltwiseTypes> eltwiseOpTypes = {
+ ngraph::helpers::EltwiseTypes::ADD,
+ ngraph::helpers::EltwiseTypes::MULTIPLY,
+ // TODO: Disabled because memory formats filter is not propagated through ngraph transformations
+// ngraph::helpers::EltwiseTypes::SUBTRACT,
+// ngraph::helpers::EltwiseTypes::DIVIDE,
+ ngraph::helpers::EltwiseTypes::FLOOR_MOD,
+ ngraph::helpers::EltwiseTypes::SQUARED_DIFF,
+};
+
+// No extra plugin configuration is passed to the tests.
+std::map<std::string, std::string> additional_config = {};
+
+// Adapts blocked memory formats to the host ISA: when AVX-512F is not available, every
+// nChw16c / nCdhw16c entry in the input and output format lists is replaced with its 8-channel
+// counterpart (nChw8c / nCdhw8c).
+// NOTE: paramsVector is adjusted in place; the returned vector is a copy of the adjusted input.
+std::vector<CPUSpecificParams> filterCPUSpecificParams(std::vector<CPUSpecificParams>& paramsVector) {
+    auto adjustBlockedFormatByIsa = [](std::vector<cpu_memory_format_t>& formats) {
+        for (auto& format : formats) {
+            if (format == nChw16c)
+                format = nChw8c;
+            if (format == nCdhw16c)
+                format = nCdhw8c;
+        }
+    };
+
+    if (!with_cpu_x86_avx512f()) {
+        for (auto& param : paramsVector) {
+            adjustBlockedFormatByIsa(std::get<0>(param));  // input formats
+            adjustBlockedFormatByIsa(std::get<1>(param));  // output formats
+        }
+    }
+
+    return paramsVector;
+}
+
+// 4D cases where both inputs use the same memory format.
+// An entry with one shape is used for both inputs; an entry with two shapes gives the first and
+// second input shapes.
+std::vector<std::vector<std::vector<size_t>>> inShapes_4D = {
+ {{2, 4, 4, 1}},
+ {{2, 17, 5, 4}},
+ {{2, 17, 5, 4}, {1, 17, 1, 1}},
+ {{2, 17, 5, 1}, {1, 17, 1, 4}},
+};
+
+// CPUSpecificParams: {input formats}, {output formats}, priority, selected type.
+std::vector<CPUSpecificParams> cpuParams_4D = {
+ CPUSpecificParams({nChw16c, nChw16c}, {nChw16c}, {}, {}),
+ CPUSpecificParams({nhwc, nhwc}, {nhwc}, {}, {}),
+ CPUSpecificParams({nchw, nchw}, {nchw}, {}, {})
+};
+
+// Inner Combine: generic Eltwise test parameters; outer Combine adds the ISA-filtered CPU parameters.
+const auto params_4D_FP32 = ::testing::Combine(
+ ::testing::Combine(
+ ::testing::ValuesIn(inShapes_4D),
+ ::testing::ValuesIn(eltwiseOpTypes),
+ ::testing::ValuesIn(secondaryInputTypes),
+ ::testing::ValuesIn(opTypes),
+ ::testing::Values(InferenceEngine::Precision::FP32),
+ ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
+ ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
+ ::testing::Values(InferenceEngine::Layout::ANY),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU),
+ ::testing::Values(additional_config)),
+ ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D)));
+
+INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_4D_FP32, EltwiseLayerCPUTest, params_4D_FP32, EltwiseLayerCPUTest::getTestCaseName);
+
+
+// 5D cases where both inputs use the same memory format.
+// An entry with one shape is used for both inputs; an entry with two shapes gives the first and
+// second input shapes.
+std::vector<std::vector<std::vector<size_t>>> inShapes_5D = {
+ {{2, 4, 3, 4, 1}},
+ {{2, 17, 7, 5, 4}},
+ {{2, 17, 6, 5, 4}, {1, 17, 6, 1, 1}},
+ {{2, 17, 6, 5, 1}, {1, 17, 1, 1, 4}},
+};
+
+// CPUSpecificParams: {input formats}, {output formats}, priority, selected type.
+std::vector<CPUSpecificParams> cpuParams_5D = {
+ CPUSpecificParams({nCdhw16c, nCdhw16c}, {nCdhw16c}, {}, {}),
+ CPUSpecificParams({ndhwc, ndhwc}, {ndhwc}, {}, {}),
+ CPUSpecificParams({ncdhw, ncdhw}, {ncdhw}, {}, {})
+};
+
+// Inner Combine: generic Eltwise test parameters; outer Combine adds the ISA-filtered CPU parameters.
+const auto params_5D_FP32 = ::testing::Combine(
+ ::testing::Combine(
+ ::testing::ValuesIn(inShapes_5D),
+ ::testing::ValuesIn(eltwiseOpTypes),
+ ::testing::ValuesIn(secondaryInputTypes),
+ ::testing::ValuesIn(opTypes),
+ ::testing::Values(InferenceEngine::Precision::FP32),
+ ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
+ ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
+ ::testing::Values(InferenceEngine::Layout::ANY),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU),
+ ::testing::Values(additional_config)),
+ ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D)));
+
+INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_5D_FP32, EltwiseLayerCPUTest, params_5D_FP32, EltwiseLayerCPUTest::getTestCaseName);
+
+
+// 4D mixed-layout cases: first input in a blocked format, second input (a constant with a single
+// channel) in planar nchw; the output is blocked.
+std::vector<std::vector<std::vector<size_t>>> inShapes_4D_Blocked_Planar = {
+ {{2, 17, 31, 3}, {2, 1, 31, 3}},
+ {{2, 17, 5, 1}, {2, 1, 1, 4}},
+};
+
+std::vector<CPUSpecificParams> cpuParams_4D_Blocked_Planar = {
+ CPUSpecificParams({nChw16c, nchw}, {nChw16c}, {}, {}),
+};
+
+// The secondary input is fixed to CONSTANT for this suite.
+const auto params_4D_FP32_Blocked_Planar = ::testing::Combine(
+ ::testing::Combine(
+ ::testing::ValuesIn(inShapes_4D_Blocked_Planar),
+ ::testing::ValuesIn(eltwiseOpTypes),
+ ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT),
+ ::testing::ValuesIn(opTypes),
+ ::testing::Values(InferenceEngine::Precision::FP32),
+ ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
+ ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
+ ::testing::Values(InferenceEngine::Layout::ANY),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU),
+ ::testing::Values(additional_config)),
+ ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D_Blocked_Planar)));
+
+INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_4D_FP32_Blocked_Planar, EltwiseLayerCPUTest, params_4D_FP32_Blocked_Planar, EltwiseLayerCPUTest::getTestCaseName);
+
+
+// 4D mixed-layout cases: first input (single channel) in planar nchw, second input (a constant)
+// in a blocked format; the output is blocked.
+std::vector<std::vector<std::vector<size_t>>> inShapes_4D_Planar_Blocked = {
+ {{2, 1, 31, 3}, {2, 17, 31, 3}},
+ {{2, 1, 1, 4}, {2, 17, 5, 1}},
+};
+
+std::vector<CPUSpecificParams> cpuParams_4D_Planar_Blocked = {
+ CPUSpecificParams({nchw, nChw16c}, {nChw16c}, {}, {}),
+};
+
+// The secondary input is fixed to CONSTANT for this suite.
+const auto params_4D_FP32_Planar_Blocked = ::testing::Combine(
+ ::testing::Combine(
+ ::testing::ValuesIn(inShapes_4D_Planar_Blocked),
+ ::testing::ValuesIn(eltwiseOpTypes),
+ ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT),
+ ::testing::ValuesIn(opTypes),
+ ::testing::Values(InferenceEngine::Precision::FP32),
+ ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
+ ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
+ ::testing::Values(InferenceEngine::Layout::ANY),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU),
+ ::testing::Values(additional_config)),
+ ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D_Planar_Blocked)));
+
+INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_4D_FP32_Planar_Blocked, EltwiseLayerCPUTest, params_4D_FP32_Planar_Blocked, EltwiseLayerCPUTest::getTestCaseName);
+
+
+// 5D mixed-layout cases: first input in a blocked format, second input (a constant with a single
+// channel) in planar ncdhw; the output is blocked.
+std::vector<std::vector<std::vector<size_t>>> inShapes_5D_Blocked_Planar = {
+ {{2, 17, 31, 4, 3}, {2, 1, 31, 1, 3}},
+ {{2, 17, 5, 3, 1}, {2, 1, 1, 3, 4}},
+};
+
+std::vector<CPUSpecificParams> cpuParams_5D_Blocked_Planar = {
+ CPUSpecificParams({nCdhw16c, ncdhw}, {nCdhw16c}, {}, {}),
+};
+
+// The secondary input is fixed to CONSTANT for this suite.
+const auto params_5D_FP32_Blocked_Planar = ::testing::Combine(
+ ::testing::Combine(
+ ::testing::ValuesIn(inShapes_5D_Blocked_Planar),
+ ::testing::ValuesIn(eltwiseOpTypes),
+ ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT),
+ ::testing::ValuesIn(opTypes),
+ ::testing::Values(InferenceEngine::Precision::FP32),
+ ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
+ ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
+ ::testing::Values(InferenceEngine::Layout::ANY),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU),
+ ::testing::Values(additional_config)),
+ ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D_Blocked_Planar)));
+
+INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_5D_FP32_Blocked_Planar, EltwiseLayerCPUTest, params_5D_FP32_Blocked_Planar, EltwiseLayerCPUTest::getTestCaseName);
+
+
+// 5D mixed-layout cases: first input (single channel) in planar ncdhw, second input (a constant)
+// in a blocked format; the output is blocked.
+std::vector<std::vector<std::vector<size_t>>> inShapes_5D_Planar_Blocked = {
+ {{2, 1, 31, 1, 3}, {2, 17, 31, 4, 3}},
+ {{2, 1, 1, 3, 4}, {2, 17, 5, 3, 1}},
+};
+
+std::vector<CPUSpecificParams> cpuParams_5D_Planar_Blocked = {
+ CPUSpecificParams({ncdhw, nCdhw16c}, {nCdhw16c}, {}, {}),
+};
+
+// The secondary input is fixed to CONSTANT for this suite.
+const auto params_5D_FP32_Planar_Blocked = ::testing::Combine(
+ ::testing::Combine(
+ ::testing::ValuesIn(inShapes_5D_Planar_Blocked),
+ ::testing::ValuesIn(eltwiseOpTypes),
+ ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT),
+ ::testing::ValuesIn(opTypes),
+ ::testing::Values(InferenceEngine::Precision::FP32),
+ ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
+ ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
+ ::testing::Values(InferenceEngine::Layout::ANY),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU),
+ ::testing::Values(additional_config)),
+ ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D_Planar_Blocked)));
+
+INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_5D_FP32_Planar_Blocked, EltwiseLayerCPUTest, params_5D_FP32_Planar_Blocked, EltwiseLayerCPUTest::getTestCaseName);
+
+} // namespace
+} // namespace CPULayerTestsDefinitions
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+#include <tuple>
+#include <string>
+#include <vector>
+#include <memory>
+#include <debug.h>
+#include <functional_test_utils/layer_test_utils.hpp>
+#include <ngraph_functions/builders.hpp>
+#include <ie_precision.hpp>
+#include "common_test_utils/common_utils.hpp"
+#include "functional_test_utils/precision_utils.hpp"
+#include "functional_test_utils/skip_tests_config.hpp"
+#include "test_utils/cpu_test_utils.hpp"
+#include "ie_system_conf.h"
+
+using namespace CPUTestUtils;
+using InferenceEngine::Precision;
+using ngraph::helpers::EltwiseTypes;
+using FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc;
+
+namespace CPULayerTestsDefinitions {
+
+// Parameter tuple of the Eltwise chain test.
+using EltwiseChainTuple = std::tuple<
+        std::vector<std::vector<size_t>>,         // Input shapes
+        std::vector<InferenceEngine::Precision>,  // Input precisions
+        std::vector<EltwiseTypes>,                // Eltwise operations
+        bool,                                     // With quantization
+        std::string                               // Device name
+>;
+
+// Value-parameterized fixture that builds a chain of Eltwise operations:
+//   Parameter -> Eltwise(op0, const0) -> Eltwise(op1, const1) -> ...
+// With quantization enabled, a FakeQuantize node is inserted before the last operation of the chain.
+class EltwiseChainTest : public testing::WithParamInterface<EltwiseChainTuple>,
+                         virtual public LayerTestsUtils::LayerTestsCommon {
+public:
+    // Test name lists every input shape, input precision and operation, then the quantization flag and device.
+    static std::string getTestCaseName(const testing::TestParamInfo<EltwiseChainTuple> &obj) {
+        std::vector<std::vector<size_t>> inputShapes;
+        std::vector<InferenceEngine::Precision> inputPrecisions;
+        std::vector<EltwiseTypes> eltwiseOpTypes;
+        bool withQuantization;
+        std::string targetName;
+        std::tie(inputShapes, inputPrecisions, eltwiseOpTypes, withQuantization, targetName) = obj.param;
+        std::ostringstream results;
+
+        for (size_t i = 0; i < inputShapes.size(); i++) {
+            results << "IS" << std::to_string(i) << "=" << CommonTestUtils::vec2str(inputShapes[i]) << "_";
+        }
+        for (size_t i = 0; i < inputPrecisions.size(); i++) {
+            results << "InPRC" << std::to_string(i) << "=" << inputPrecisions[i].name() << "_";
+        }
+        for (size_t i = 0; i < eltwiseOpTypes.size(); i++) {
+            results << "Op" << std::to_string(i) << "=" << eltwiseOpTypes[i] << "_";
+        }
+
+        results << "WithQuant=" << withQuantization << "_";
+        results << "targetDevice=" << targetName;
+
+        return results.str();
+    }
+
+protected:
+    // Unpacks the test parameters and builds `function` (the Eltwise chain described above).
+    void SetUp() override {
+        threshold = 0.1f;
+
+        std::vector<std::vector<size_t>> inputShapes;
+        std::vector<InferenceEngine::Precision> inputPrecisions;
+        std::vector<EltwiseTypes> eltwiseOpTypes;
+        bool withQuantization;
+        std::tie(inputShapes, inputPrecisions, eltwiseOpTypes, withQuantization, targetDevice) = this->GetParam();
+
+        // Input 0 feeds the Parameter; inputs 1..N-1 become constants, one per operation in the chain.
+        // Reject parameter sets that would index past the end of any of the vectors below.
+        ASSERT_FALSE(eltwiseOpTypes.empty());
+        ASSERT_GT(inputPrecisions.size(), eltwiseOpTypes.size());
+        ASSERT_GE(inputShapes.size(), inputPrecisions.size());
+
+        auto ngraphParam = ngraph::builder::makeParams(convertIE2nGraphPrc(inputPrecisions[0]), {inputShapes[0]});
+
+        std::vector<std::shared_ptr<ngraph::Node>> ngraphInputs;
+        for (size_t i = 1; i < inputPrecisions.size(); i++) {
+            std::vector<float> ngraphInput1Data(ngraph::shape_size(ngraph::Shape{inputShapes[i]}));
+            // NOTE(review): the trailing `true` presumably asks the builder to fill the constant with
+            // random values instead of the zero-filled vector - confirm against makeConstant.
+            ngraphInputs.push_back(ngraph::builder::makeConstant(convertIE2nGraphPrc(inputPrecisions[i]), ngraph::Shape{inputShapes[i]},
+                                                                 ngraphInput1Data, true));
+        }
+
+        const size_t lastOpIdx = eltwiseOpTypes.size() - 1;
+        // With quantization the last operation is applied after FakeQuantize, so the plain part of
+        // the chain stops one operation earlier.
+        const size_t plainChainLength = withQuantization ? lastOpIdx : eltwiseOpTypes.size();
+
+        std::vector<std::shared_ptr<ngraph::Node>> eltwiseOps;
+        eltwiseOps.push_back(ngraph::builder::makeEltwise(ngraphParam[0], ngraphInputs[0], eltwiseOpTypes[0]));
+        for (size_t i = 1; i < plainChainLength; i++) {
+            eltwiseOps.push_back(ngraph::builder::makeEltwise(eltwiseOps.back(), ngraphInputs[i], eltwiseOpTypes[i]));
+        }
+
+        if (withQuantization) {
+            // FakeQuantize constants have shape [1, C, 1, ...] where C is dimension 1 of the first input.
+            ASSERT_GE(inputShapes[0].size(), 2u);
+            std::vector<size_t> constShape(inputShapes[0].size(), 1);
+            constShape[1] = inputShapes[0][1];
+            auto fq = ngraph::builder::makeFakeQuantize(eltwiseOps.back(),
+                                                        ::ngraph::element::Type(::ngraph::element::Type_t::f32),
+                                                        256, constShape);
+
+            eltwiseOps.push_back(ngraph::builder::makeEltwise(fq, ngraphInputs[lastOpIdx], eltwiseOpTypes[lastOpIdx]));
+        }
+
+        const std::string functionName = withQuantization ? "eltwise_chain_fq" : "eltwise_chain";
+        ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(eltwiseOps.back())};
+        function = std::make_shared<ngraph::Function>(results, ngraphParam, functionName);
+    }
+};
+
+// Runs the chain test unless it is listed as disabled.
+TEST_P(EltwiseChainTest, CompareWithRefs) {
+ SKIP_IF_CURRENT_TEST_IS_DISABLED()
+
+ Run();
+}
+
+namespace {
+
+// Shape sets for the chain without quantization. Each entry holds four shapes: the first is used
+// for the Parameter, the remaining three for the constant inputs of the chained operations.
+std::vector<std::vector<std::vector<size_t>>> inputShapes {
+ {
+ {{1, 1, 2, 3}, {1, 1, 2, 3}, {1, 1, 2, 3}, {1, 1, 2, 3}},
+ {{1, 48, 5, 6}, {1, 48, 1, 1}, {1, 48, 5, 6}, {1, 1, 5, 6}},
+ {{1, 72, 28, 28}, {1, 72, 1, 1}, {1, 72, 1, 1}, {1, 72, 1, 1}},
+ {{2, 33, 5, 5}, {2, 33, 5, 5}, {2, 33, 1, 5}, {2, 33, 5, 5}},
+ {{1, 2, 3}, {3}, {3}, {3}},
+ {{1, 12, 5, 5}, {5, 5}, {12, 5, 5}, {1}},
+ {{3, 12, 5, 5}, {1, 12, 5, 1}, {3, 1, 1, 1}, {3, 12, 5, 5}},
+ {{1, 1, 1, 1}, {1, 12, 5, 1}, {3, 12, 1, 5}, {3, 12, 5, 1}},
+ {{1, 1, 1, 1, 6}, {1, 12, 5, 1, 6}, {3, 12, 1, 5, 1}, {3, 12, 5, 1, 1}}
+ }
+};
+
+// One precision per input; the chain is tested with all-FP32 and all-I32 inputs.
+std::vector<std::vector<InferenceEngine::Precision>> inputPrecisions = {
+ { Precision::FP32, Precision::FP32, Precision::FP32, Precision::FP32 },
+ { Precision::I32, Precision::I32, Precision::I32, Precision::I32 }
+};
+
+// Operation sequences applied along the chain (shared with the quantized suite below).
+std::vector<std::vector<EltwiseTypes>> eltwiseOps = {
+ { EltwiseTypes::ADD, EltwiseTypes::MULTIPLY, EltwiseTypes::SUBTRACT },
+ { EltwiseTypes::DIVIDE, EltwiseTypes::POWER, EltwiseTypes::ADD },
+};
+
+// Chain without FakeQuantize (quantization flag = false).
+INSTANTIATE_TEST_CASE_P(smoke_EltwiseChain, EltwiseChainTest,
+ ::testing::Combine(
+ ::testing::ValuesIn(inputShapes),
+ ::testing::ValuesIn(inputPrecisions),
+ ::testing::ValuesIn(eltwiseOps),
+ ::testing::Values(false),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+ EltwiseChainTest::getTestCaseName);
+
+// Shape sets for the chain with FakeQuantize. Each entry holds four shapes: the first is used for
+// the Parameter, the remaining three for the constant inputs of the chained operations.
+std::vector<std::vector<std::vector<size_t>>> inputShapesFQ {
+ {
+ {{1, 2, 2, 3}, {1, 2, 2, 3}, {1, 2, 2, 3}, {1, 2, 2, 3}},
+ {{2, 33, 5, 5}, {2, 33, 5, 5}, {2, 33, 1, 5}, {2, 33, 5, 5}},
+ {{2, 33, 5, 17}, {2, 33, 5, 17}, {2, 33, 5, 17}, {2, 33, 5, 17}},
+ {{2, 33, 5, 256}, {2, 33, 5, 256}, {2, 33, 5, 256}, {2, 33, 5, 256}},
+ {{2, 5, 7, 5}, {2, 5, 1, 5}, {2, 5, 7, 5}, {2, 5, 7, 5}},
+ {{2, 17, 7, 5}, {2, 17, 7, 5}, {2, 17, 7, 5}, {2, 17, 7, 5}},
+ {{2, 256, 7, 5}, {2, 256, 7, 5}, {2, 256, 1, 5}, {2, 256, 7, 5}},
+ {{1, 36, 34, 34}, {1, 36, 34, 34}, {1, 36, 34, 34}, {1, 36, 34, 34}},
+ {{1, 12, 1, 1, 6}, {1, 12, 5, 1, 6}, {3, 12, 1, 5, 1}, {3, 12, 5, 1, 1}},
+ {{1, 12, 1, 1, 6}, {1, 12, 5, 5, 6}, {3, 12, 1, 5, 1}, {3, 12, 5, 5, 1}},
+ {{1, 12, 1, 1, 1}, {1, 12, 5, 1, 7}, {3, 12, 1, 5, 7}, {3, 12, 5, 1, 7}},
+ {{1, 7, 1, 1, 12}, {1, 7, 5, 1, 12}, {3, 7, 1, 5, 12}, {3, 7, 5, 1, 12}},
+ {{1, 7, 1, 1, 12, 3, 7}, {1, 7, 5, 1, 12, 3, 7}, {3, 7, 1, 5, 12, 3, 7}, {3, 7, 5, 1, 12, 3, 7}},
+ {{1, 7, 1, 1, 12, 3, 1}, {1, 7, 5, 1, 12, 3, 7}, {3, 7, 1, 5, 12, 1, 7}, {3, 7, 5, 1, 12, 3, 1}}
+ }
+};
+
+// The quantized chain is tested with FP32 inputs only.
+std::vector<std::vector<InferenceEngine::Precision>> inputPrecisionsFQ {
+ { Precision::FP32, Precision::FP32, Precision::FP32, Precision::FP32 }
+};
+
+// Chain with FakeQuantize inserted before the last operation (quantization flag = true).
+INSTANTIATE_TEST_CASE_P(smoke_EltwiseChainWithFQ, EltwiseChainTest,
+ ::testing::Combine(
+ ::testing::ValuesIn(inputShapesFQ),
+ ::testing::ValuesIn(inputPrecisionsFQ),
+ ::testing::ValuesIn(eltwiseOps),
+ ::testing::Values(true),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+ EltwiseChainTest::getTestCaseName);
+
+} // namespace
+} // namespace CPULayerTestsDefinitions
FAIL() << "Unsupported Secondary operation type";
}
- auto secondaryInput = ngraph::builder::makeInputLayer(ngPrc, secondaryInputType, shape_input_secondary);
- if (secondaryInputType == ngraph::helpers::InputLayerType::PARAMETER) {
- input.push_back(std::dynamic_pointer_cast<ngraph::opset3::Parameter>(secondaryInput));
+ std::shared_ptr<ngraph::Node> secondaryInput;
+ if (eltwiseType == ngraph::helpers::EltwiseTypes::DIVIDE ||
+ eltwiseType == ngraph::helpers::EltwiseTypes::FLOOR_MOD ||
+ eltwiseType == ngraph::helpers::EltwiseTypes::MOD) {
+ std::vector<float> data(ngraph::shape_size(shape_input_secondary));
+ data = NGraphFunctions::Utils::generateVector<ngraph::element::Type_t::f32>(ngraph::shape_size(shape_input_secondary));
+ for (float &i : data) {
+ if (i == 0) {
+ i = 1;
+ }
+ }
+ secondaryInput = ngraph::builder::makeConstant(ngPrc, shape_input_secondary, data);
+ } else {
+ secondaryInput = ngraph::builder::makeInputLayer(ngPrc, secondaryInputType, shape_input_secondary);
+ if (secondaryInputType == ngraph::helpers::InputLayerType::PARAMETER) {
+ input.push_back(std::dynamic_pointer_cast<ngraph::opset3::Parameter>(secondaryInput));
+ }
}
+
auto eltwise = ngraph::builder::makeEltwise(input[0], secondaryInput, eltwiseType);
function = std::make_shared<ngraph::Function>(eltwise, input, "Eltwise");
}
case ngraph::helpers::EltwiseTypes::FLOOR_MOD:
os << "FloorMod";
break;
+ case ngraph::helpers::EltwiseTypes::MOD:
+ os << "Mod";
+ break;
default:
throw std::runtime_error("NOT_SUPPORTED_OP_TYPE");
}
return str_op;
}
-class MKLDNNGraphEltwise3InputsTests: public TestsCommon,
- public WithParamInterface<eltwise_test_params> {
- std::string model_t = R"V0G0N(
-<net name="EltwiseOnly" version="3" precision="FP32" batch="1">
- <layers>
- <layer name="in1" type="Input" precision="FP32" id="1">
- <output>
- <port id="1">__SRC_DIMS_1__
- </port>
- </output>
- </layer>
- <layer name="in2" type="Input" precision="FP32" id="2">
- <output>
- <port id="2">__SRC_DIMS_2__
- </port>
- </output>
- </layer>
- <layer name="in3" type="Input" precision="FP32" id="3">
- <output>
- <port id="3">__SRC_DIMS_3__
- </port>
- </output>
- </layer>
- <layer name="con" id="4" type="Eltwise" precision="FP32">
- <data operation="_OP_" _COEFF_/>
- <input>
- <port id="1">__SRC_DIMS_1__
- </port>
- <port id="2">__SRC_DIMS_2__
- </port>
- <port id="3">__SRC_DIMS_3__
- </port>
- </input>
- <output>
- <port id="4">__SRC_DIMS__
- </port>
- </output>
- </layer>
- </layers>
- <edges>
- <edge from-layer="1" from-port="1" to-layer="4" to-port="1"/>
- <edge from-layer="2" from-port="2" to-layer="4" to-port="2"/>
- <edge from-layer="3" from-port="3" to-layer="4" to-port="3"/>
- </edges>
-</net>
-)V0G0N";
-
-protected:
- std::string getModel(eltwise_test_params p) {
- std::string model = model_t;
- std::string op = select_op(p.op);
-
- std::string src_dims1;
- for (auto &dim : p.dims1) {
- src_dims1 += "\n <dim>";
- src_dims1 += std::to_string(dim) + "</dim>";
- }
- REPLACE_WITH_STR(model, "__SRC_DIMS_1__", src_dims1);
-
- std::string src_dims2;
- for (auto &dim : p.dims2) {
- src_dims2 += "\n <dim>";
- src_dims2 += std::to_string(dim) + "</dim>";
- }
- REPLACE_WITH_STR(model, "__SRC_DIMS_2__", src_dims2);
-
- std::string src_dims3;
- for (auto &dim : p.dims3) {
- src_dims3 += "\n <dim>";
- src_dims3 += std::to_string(dim) + "</dim>";
- }
- REPLACE_WITH_STR(model, "__SRC_DIMS_3__", src_dims3);
-
- std::string src_dims;
- std::vector<size_t> dims = p.dims1;
- for (int i = 0; i < dims.size(); i++) {
- dims[i] = std::max(p.dims1[i], p.dims2[i]);
- dims[i] = std::max(dims[i], p.dims3[i]);
- }
- for (auto &dim : dims) {
- src_dims += "\n <dim>";
- src_dims += std::to_string(dim) + "</dim>";
- }
- REPLACE_WITH_STR(model, "__SRC_DIMS__", src_dims);
-
- std::string scale;
- if (!p.scales.empty()) {
- scale = std::string("coeff=\"") + to_string_c_locale(p.scales) + std::string("\"");
- }
- REPLACE_WITH_STR(model, "_OP_", op);
- REPLACE_WITH_STR(model, "_COEFF_", scale);
-
- return model;
- }
-
- virtual void TearDown() {
- }
-
- virtual void SetUp() {
- try {
- TestsCommon::SetUp();
- eltwise_test_params p = ::testing::WithParamInterface<eltwise_test_params>::GetParam();
- std::string model = getModel(p);
-
- InferenceEngine::Core core;
- InferenceEngine::CNNNetwork network;
- ASSERT_NO_THROW(network = core.ReadNetwork(model, InferenceEngine::Blob::CPtr()));
-
- MKLDNNGraphTestClass graph;
- graph.CreateGraph(network);
-
- auto& nodes = graph.getNodes();
- for (int i = 0; i < nodes.size(); i++) {
- if (nodes[i]->getType() == MKLDNNPlugin::Eltwise) {
- ASSERT_EQ(p.num_prim_desc, nodes[i]->getSupportedPrimitiveDescriptors().size());
- for (size_t j = 0; j < p.num_prim_desc && j < p.comp.size(); j++) {
- p.comp.at(j)(nodes[i]->getSupportedPrimitiveDescriptors().at(j));
- }
- ASSERT_NE(nullptr, nodes[i]->getSelectedPrimitiveDescriptor());
- ASSERT_EQ(p.selectedType, nodes[i]->getSelectedPrimitiveDescriptor()->getImplementationType());
- }
- }
- InferenceEngine::SizeVector dims_src1 = p.dims1;
- InferenceEngine::Layout layout1 = InferenceEngine::ANY;
- switch (p.dims1.size()) {
- case 4:
- layout1 = InferenceEngine::NCHW;
- break;
- case 5:
- layout1 = InferenceEngine::NCDHW;
- break;
- }
- InferenceEngine::SizeVector dims_src2 = p.dims2;
- InferenceEngine::Layout layout2 = InferenceEngine::ANY;
- switch (p.dims2.size()) {
- case 4:
- layout2 = InferenceEngine::NCHW;
- break;
- case 5:
- layout2 = InferenceEngine::NCDHW;
- break;
- }
- InferenceEngine::SizeVector dims_src3 = p.dims3;
- InferenceEngine::Layout layout3 = InferenceEngine::ANY;
- switch (p.dims3.size()) {
- case 4:
- layout3 = InferenceEngine::NCHW;
- break;
- case 5:
- layout3 = InferenceEngine::NCDHW;
- break;
- }
-
- InferenceEngine::Blob::Ptr src1 = InferenceEngine::make_shared_blob<float>({InferenceEngine::Precision::FP32, dims_src1, layout1});
- src1->allocate();
-
- InferenceEngine::TBlob<float>* srcPtr1 = dynamic_cast<InferenceEngine::TBlob<float>*>(src1.get());
-
- if (srcPtr1 == nullptr)
- FAIL() << "Cannot cast blob to TBlob<float>.";
- CommonTestUtils::fill_data_sine(src1->buffer(), src1->size(), 0.1, 0.9, 1);
- InferenceEngine::Blob::Ptr src2 = InferenceEngine::make_shared_blob<float>({InferenceEngine::Precision::FP32, dims_src2, layout2});
- src2->allocate();
-
- InferenceEngine::TBlob<float>* srcPtr2 = dynamic_cast<InferenceEngine::TBlob<float>*>(src2.get());
-
- if (srcPtr2 == nullptr)
- FAIL() << "Cannot cast blob to TBlob<float>.";
- CommonTestUtils::fill_data_sine(src2->buffer(), src2->size(), 0.1, 0.9, 2);
- InferenceEngine::Blob::Ptr src3 = InferenceEngine::make_shared_blob<float>({InferenceEngine::Precision::FP32, dims_src3, layout3});
- src3->allocate();
-
- InferenceEngine::TBlob<float>* srcPtr3 = dynamic_cast<InferenceEngine::TBlob<float>*>(src3.get());
-
- if (srcPtr3 == nullptr)
- FAIL() << "Cannot cast blob to TBlob<float>.";
- CommonTestUtils::fill_data_sine(src3->buffer(), src3->size(), 0.1, 0.9, 3);
- InferenceEngine::BlobMap srcs;
- srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in1", src1));
- srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in2", src2));
- srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in3", src3));
-
- InferenceEngine::OutputsDataMap out;
- out = network.getOutputsInfo();
- InferenceEngine::BlobMap outputBlobs;
-
- std::pair<std::string, InferenceEngine::DataPtr> item = *out.begin();
-
- InferenceEngine::TBlob<float>::Ptr output;
- output = InferenceEngine::make_shared_blob<float>(item.second->getTensorDesc());
- output->allocate();
- outputBlobs[item.first] = output;
-
- graph.Infer(srcs, outputBlobs);
-
- InferenceEngine::TBlob<float> dst_ref(item.second->getTensorDesc());
- dst_ref.allocate();
-
- std::vector<InferenceEngine::TBlob<float>> src_vec = {*srcPtr1, *srcPtr2, *srcPtr3};
-
- ref_eltwise(src_vec, dst_ref, p);
-
- compare(*output, dst_ref, 0.0005f);
- } catch (const InferenceEngine::details::InferenceEngineException &e) {
- FAIL() << e.what();
- }
- }
-};
-
-TEST_P(MKLDNNGraphEltwise3InputsTests, TestsEltwise) {}
-
-
-INSTANTIATE_TEST_CASE_P(
- TestsEltwise, MKLDNNGraphEltwise3InputsTests,
- ::testing::Values(
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref, {
- [](MKLDNNPlugin::PrimitiveDescInfo impl) {
- ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType());
- ASSERT_EQ(3, impl.getConfig().inConfs.size());
- ASSERT_EQ(1, impl.getConfig().outConfs.size());
- ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
- ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(1).desc.getLayout());
- ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(2).desc.getLayout());
- ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
- }
- } },
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sum, "1.0,1.0,1.0", 3, MKLDNNPlugin::impl_desc_type::ref, {
- [](MKLDNNPlugin::PrimitiveDescInfo impl) {
- ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType());
- ASSERT_EQ(3, impl.getConfig().inConfs.size());
- ASSERT_EQ(1, impl.getConfig().outConfs.size());
- ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
- ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(1).desc.getLayout());
- ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(2).desc.getLayout());
- ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
- }
- } },
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sum, "1.5,0.5,-2.0", 3, MKLDNNPlugin::impl_desc_type::ref, {
- [](MKLDNNPlugin::PrimitiveDescInfo impl) {
- ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType());
- ASSERT_EQ(3, impl.getConfig().inConfs.size());
- ASSERT_EQ(1, impl.getConfig().outConfs.size());
- ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
- ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(1).desc.getLayout());
- ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(2).desc.getLayout());
- ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
- }
- } },
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Prod, "", 3, MKLDNNPlugin::impl_desc_type::ref, {
- [](MKLDNNPlugin::PrimitiveDescInfo impl) {
- ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType());
- ASSERT_EQ(3, impl.getConfig().inConfs.size());
- ASSERT_EQ(1, impl.getConfig().outConfs.size());
- ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
- ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(1).desc.getLayout());
- ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(2).desc.getLayout());
- ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
- }
- } },
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Max, "", 3, MKLDNNPlugin::impl_desc_type::ref, {
- [](MKLDNNPlugin::PrimitiveDescInfo impl) {
- ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType());
- ASSERT_EQ(3, impl.getConfig().inConfs.size());
- ASSERT_EQ(1, impl.getConfig().outConfs.size());
- ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
- ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(1).desc.getLayout());
- ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(2).desc.getLayout());
- ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
- }
- } },
- eltwise_test_params{{1, 32, 16, 16, 16},{1, 32, 16, 16, 16},{1, 32, 16, 16, 16}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref, {
- [](MKLDNNPlugin::PrimitiveDescInfo impl) {
- ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType());
- ASSERT_EQ(3, impl.getConfig().inConfs.size());
- ASSERT_EQ(1, impl.getConfig().outConfs.size());
- ASSERT_EQ(InferenceEngine::Layout::NCDHW, impl.getConfig().inConfs.at(0).desc.getLayout());
- ASSERT_EQ(InferenceEngine::Layout::NCDHW, impl.getConfig().inConfs.at(1).desc.getLayout());
- ASSERT_EQ(InferenceEngine::Layout::NCDHW, impl.getConfig().inConfs.at(2).desc.getLayout());
- ASSERT_EQ(InferenceEngine::Layout::NCDHW, impl.getConfig().outConfs.at(0).desc.getLayout());
- }
- } },
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Min, "", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sub, "", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Div, "", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Logical_AND, "", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Logical_OR, "", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Logical_XOR, "", 3, MKLDNNPlugin::impl_desc_type::ref}
- ));
-
-class MKLDNNGraphEltwise2InputsTests: public TestsCommon,
- public WithParamInterface<eltwise_test_params> {
- std::string model_t = R"V0G0N(
-<net name="EltwiseOnly" version="2" precision="FP32">
- <layers>
- <layer name="in1" type="Input" precision="FP32" id="1">
- <output>
- <port id="1">__SRC_DIMS_1__
- </port>
- </output>
- </layer>
- <layer name="in2" type="Input" precision="FP32" id="2">
- <output>
- <port id="2">__SRC_DIMS_2__
- </port>
- </output>
- </layer>
- <layer name="con" id="3" type="Eltwise" precision="FP32">
- <data operation="_OP_" _COEFF_/>
- <input>
- <port id="1">__SRC_DIMS_1__
- </port>
- <port id="2">__SRC_DIMS_2__
- </port>
- </input>
- <output>
- <port id="3">__SRC_DIMS__
- </port>
- </output>
- </layer>
- </layers>
- <edges>
- <edge from-layer="1" from-port="1" to-layer="3" to-port="1"/>
- <edge from-layer="2" from-port="2" to-layer="3" to-port="2"/>
- </edges>
-</net>
-)V0G0N";
-
-protected:
- std::string getModel(eltwise_test_params p) {
- std::string model = model_t;
- std::string op = select_op(p.op);
-
- std::string src_dims1 = "";
- for (auto &dim : p.dims1) {
- src_dims1 += "\n <dim>";
- src_dims1 += std::to_string(dim) + "</dim>";
- }
- REPLACE_WITH_STR(model, "__SRC_DIMS_1__", src_dims1);
-
- std::string src_dims2 = "";
- for (auto &dim : p.dims2) {
- src_dims2 += "\n <dim>";
- src_dims2 += std::to_string(dim) + "</dim>";
- }
- REPLACE_WITH_STR(model, "__SRC_DIMS_2__", src_dims2);
-
- std::string src_dims;
- std::vector<size_t> dims = (p.dims1.size() >= p.dims2.size()) ? p.dims1 : p.dims2;
- int i = dims.size() - 1, j = p.dims1.size() - 1, k = p.dims2.size() - 1;
- for (; j >= 0 && k >= 0; i--, j--, k-- ) {
- dims[i] = std::max(p.dims1[j], p.dims2[k]);
- }
-
- for (auto &dim : dims) {
- src_dims += "\n <dim>";
- src_dims += std::to_string(dim) + "</dim>";
- }
- REPLACE_WITH_STR(model, "__SRC_DIMS__", src_dims);
-
- std::string scale;
- if (!p.scales.empty()) {
- scale = std::string("coeff=\"") + to_string_c_locale(p.scales) + std::string("\"");
- }
- REPLACE_WITH_STR(model, "_OP_", op);
- REPLACE_WITH_STR(model, "_COEFF_", scale);
-
- return model;
- }
-
- virtual void TearDown() {
- }
-
- virtual void SetUp() {
- try {
- TestsCommon::SetUp();
- eltwise_test_params p = ::testing::WithParamInterface<eltwise_test_params>::GetParam();
- std::string model = getModel(p);
-
- InferenceEngine::Core core;
- InferenceEngine::CNNNetwork network;
- ASSERT_NO_THROW(network = core.ReadNetwork(model, InferenceEngine::Blob::CPtr()));
-
- MKLDNNGraphTestClass graph;
- graph.CreateGraph(network);
-
- auto& nodes = graph.getNodes();
- for (int i = 0; i < nodes.size(); i++) {
- if (nodes[i]->getType() == MKLDNNPlugin::Eltwise) {
- ASSERT_EQ(p.num_prim_desc, nodes[i]->getSupportedPrimitiveDescriptors().size());
- for (size_t j = 0; j < p.num_prim_desc && j < p.comp.size(); j++) {
- p.comp.at(j)(nodes[i]->getSupportedPrimitiveDescriptors().at(j));
- }
- ASSERT_NE(nullptr, nodes[i]->getSelectedPrimitiveDescriptor());
- ASSERT_EQ(p.selectedType, nodes[i]->getSelectedPrimitiveDescriptor()->getImplementationType());
- }
- }
- InferenceEngine::SizeVector dims_src1 = p.dims1;
- InferenceEngine::Blob::Ptr src1 = InferenceEngine::make_shared_blob<float>({InferenceEngine::Precision::FP32, dims_src1, InferenceEngine::TensorDesc::getLayoutByDims(p.dims1) });
- src1->allocate();
-
- InferenceEngine::TBlob<float>* srcPtr1 = dynamic_cast<InferenceEngine::TBlob<float>*>(src1.get());
-
- if (srcPtr1 == nullptr)
- FAIL() << "Cannot cast blob to TBlob<float>.";
-
- CommonTestUtils::fill_data_sine(src1->buffer(), src1->size(), 0.1, 0.9, 1);
-
- InferenceEngine::SizeVector dims_src2 = p.dims2;
- InferenceEngine::Blob::Ptr src2 = InferenceEngine::make_shared_blob<float>({InferenceEngine::Precision::FP32, dims_src2, InferenceEngine::TensorDesc::getLayoutByDims(p.dims2) });
- src2->allocate();
-
- InferenceEngine::TBlob<float>* srcPtr2 = dynamic_cast<InferenceEngine::TBlob<float>*>(src2.get());
-
- if (srcPtr2 == nullptr)
- FAIL() << "Cannot cast blob to TBlob<float>.";
-
- CommonTestUtils::fill_data_sine(src2->buffer(), src2->size(), 0.1, 0.9, 2);
-
- InferenceEngine::BlobMap srcs;
- srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in1", src1));
- srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in2", src2));
-
- InferenceEngine::OutputsDataMap out;
- out = network.getOutputsInfo();
- InferenceEngine::BlobMap outputBlobs;
-
- std::pair<std::string, InferenceEngine::DataPtr> item = *out.begin();
-
- InferenceEngine::TBlob<float>::Ptr output;
- output = InferenceEngine::make_shared_blob<float>(item.second->getTensorDesc());
- output->allocate();
- outputBlobs[item.first] = output;
-
- graph.Infer(srcs, outputBlobs);
-
- InferenceEngine::TBlob<float> dst_ref(item.second->getTensorDesc());
- dst_ref.allocate();
-
- std::vector<InferenceEngine::TBlob<float>> src_vec = {*srcPtr1, *srcPtr2};
-
- ref_eltwise(src_vec, dst_ref, p);
-
- compare(*output, dst_ref, 0.0005f);
- } catch (const InferenceEngine::details::InferenceEngineException &e) {
- FAIL() << e.what();
- }
- }
-
-};
-
-TEST_P(MKLDNNGraphEltwise2InputsTests, TestsEltwise) {}
-
-INSTANTIATE_TEST_CASE_P(
- TestsEltwise, MKLDNNGraphEltwise2InputsTests,
- ::testing::Values(
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Prod, "", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Max, "", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Min, "", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Sub, "", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Div, "", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Squared_diff, "", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Logical_AND, "", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Logical_OR, "", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Logical_XOR, "", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Less, "", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Less_equal, "", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Greater, "", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Greater_equal, "", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Equal, "", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Not_equal, "", 3, MKLDNNPlugin::impl_desc_type::ref}
- ));
-
-INSTANTIATE_TEST_CASE_P(
- TestsBroadcasting, MKLDNNGraphEltwise2InputsTests,
- ::testing::Values(
- eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Prod, "", 1, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Max, "", 1, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Min, "", 1, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Sub, "", 1, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Div, "", 1, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Squared_diff, "", 1, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Logical_AND, "", 1, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Logical_OR, "", 1, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Logical_XOR, "", 1, MKLDNNPlugin::impl_desc_type::ref},
- // batch broadcasting
- eltwise_test_params{{1, 3, 224},{224, 3, 1},{}, eltwise_test_params::opType::Sum, "", 2, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{2, 3, 1, 2},{1, 3, 2, 1},{}, eltwise_test_params::opType::Sub, "", 1, MKLDNNPlugin::impl_desc_type::ref}
-
- ));
-
-INSTANTIATE_TEST_CASE_P(
- TestsDiffDims, MKLDNNGraphEltwise2InputsTests,
- ::testing::Values(
- eltwise_test_params{{},{1, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3},{},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3},{3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{},{1, 3, 3},{}, eltwise_test_params::opType::Sum, "", 2, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3},{},{}, eltwise_test_params::opType::Sum, "", 2, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3},{3},{}, eltwise_test_params::opType::Sum, "", 2, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3},{1, 3, 3},{}, eltwise_test_params::opType::Sum, "", 2, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3},{1, 3},{}, eltwise_test_params::opType::Sum, "", 2, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{},{1, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{},{1, 3, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3, 3},{},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3},{1, 3, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3, 3},{1, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3},{1, 3, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3, 3},{1, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref}
- ));
-
-class MKLDNNGraphEltwiseDynBatchTests: public MKLDNNGraphEltwise3InputsTests {
-protected:
- virtual void SetUp() {
- try {
- TestsCommon::SetUp();
- eltwise_test_params p = ::testing::WithParamInterface<eltwise_test_params>::GetParam();
- std::string model = getModel(p);
- size_t MB = p.dims1[0];
- if (MB < 2)
- MB = 2;
-
- InferenceEngine::Core core;
- InferenceEngine::CNNNetwork network;
- ASSERT_NO_THROW(network = core.ReadNetwork(model, InferenceEngine::Blob::CPtr()));
-
- auto implNet = dynamic_cast<InferenceEngine::details::CNNNetworkImpl *>(&((InferenceEngine::ICNNNetwork&)network));
- ASSERT_NE(nullptr, implNet) << "Failed to cast ICNNNetwork to CNNNetworkImpl";
- InferenceEngine::ResponseDesc resp;
- InferenceEngine::StatusCode sts = implNet->setBatchSizeReshape(MB, &resp);
- ASSERT_EQ((int)InferenceEngine::StatusCode::OK, sts) << resp.msg;
-
- MKLDNNGraphTestClass graph;
- graph.setProperty({{InferenceEngine::PluginConfigParams::KEY_DYN_BATCH_ENABLED, InferenceEngine::PluginConfigParams::YES}});
- graph.CreateGraph(network);
-
- InferenceEngine::SizeVector dims_src1 = p.dims1;
- InferenceEngine::Layout layout1 = InferenceEngine::ANY;
- switch (p.dims1.size()) {
- case 4:
- layout1 = InferenceEngine::NCHW;
- break;
- case 5:
- layout1 = InferenceEngine::NCDHW;
- break;
- }
- InferenceEngine::SizeVector dims_src2 = p.dims2;
- InferenceEngine::Layout layout2 = InferenceEngine::ANY;
- switch (p.dims2.size()) {
- case 4:
- layout2 = InferenceEngine::NCHW;
- break;
- case 5:
- layout2 = InferenceEngine::NCDHW;
- break;
- }
- InferenceEngine::SizeVector dims_src3 = p.dims3;
- InferenceEngine::Layout layout3 = InferenceEngine::ANY;
- switch (p.dims3.size()) {
- case 4:
- layout3 = InferenceEngine::NCHW;
- break;
- case 5:
- layout3 = InferenceEngine::NCDHW;
- break;
- }
-
- InferenceEngine::Blob::Ptr src1 = InferenceEngine::make_shared_blob<float>({InferenceEngine::Precision::FP32, dims_src1, layout1});
- src1->allocate();
-
- InferenceEngine::TBlob<float>* srcPtr1 = dynamic_cast<InferenceEngine::TBlob<float>*>(src1.get());
-
- if (srcPtr1 == nullptr)
- FAIL() << "Cannot cast blob to TBlob<float>.";
-
- fill_data(src1->buffer(), src1->size());
- InferenceEngine::Blob::Ptr src2 = InferenceEngine::make_shared_blob<float>({InferenceEngine::Precision::FP32, dims_src2, layout2});
- src2->allocate();
-
- InferenceEngine::TBlob<float>* srcPtr2 = dynamic_cast<InferenceEngine::TBlob<float>*>(src2.get());
-
- if (srcPtr2 == nullptr)
- FAIL() << "Cannot cast blob to TBlob<float>.";
- fill_data(src2->buffer(), src2->size());
- InferenceEngine::Blob::Ptr src3 = InferenceEngine::make_shared_blob<float>({InferenceEngine::Precision::FP32, dims_src3, layout3});
- src3->allocate();
-
- InferenceEngine::TBlob<float>* srcPtr3 = dynamic_cast<InferenceEngine::TBlob<float>*>(src3.get());
-
- if (srcPtr3 == nullptr)
- FAIL() << "Cannot cast blob to TBlob<float>.";
- fill_data(src3->buffer(), src3->size());
- InferenceEngine::BlobMap srcs;
- srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in1", src1));
- srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in2", src2));
- srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in3", src3));
-
- InferenceEngine::OutputsDataMap out;
- out = network.getOutputsInfo();
- InferenceEngine::BlobMap outputBlobs;
-
- std::pair<std::string, InferenceEngine::DataPtr> item = *out.begin();
-
- InferenceEngine::TBlob<float>::Ptr output;
- output = InferenceEngine::make_shared_blob<float>(item.second->getTensorDesc());
- output->allocate();
- outputBlobs[item.first] = output;
-
-
- auto checkDepthwise = [](const MKLDNNPlugin::MKLDNNNodePtr& node) {
- return node->getType() == MKLDNNPlugin::Eltwise;
- };
-
- graph.checkDynBatch(srcs, outputBlobs, MB, MB, checkDepthwise);
- graph.checkDynBatch(srcs, outputBlobs, 1, MB, checkDepthwise);
- } catch (const InferenceEngine::details::InferenceEngineException &e) {
- FAIL() << e.what();
- }
- }
-};
-
-TEST_P(MKLDNNGraphEltwiseDynBatchTests, TestsDynBatchEltwise) {}
-
-// TODO: rewrite to ngraph to have reshape functionality
-INSTANTIATE_TEST_CASE_P(
- DISABLED_TestsDynBatchEltwise, MKLDNNGraphEltwiseDynBatchTests,
- ::testing::Values(
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sum, "1.0,1.0,1.0", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sum, "1.5,0.5,-2.0", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Prod, "", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Max, "", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sub, "", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Min, "", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Div, "", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Pow, "", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Logical_AND, "", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Logical_OR, "", 3, MKLDNNPlugin::impl_desc_type::ref},
- eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Logical_XOR, "", 3, MKLDNNPlugin::impl_desc_type::ref}
- ));
-
struct precisions_test_2params {
struct {
std::string precision0;
TestsEltwise2Precisions, MKLDNNGraphEltwise2PrecisionsTests,
::testing::Values(
precisions_test_2params{ {"FP32", "FP32"}, 4, 0 },
- precisions_test_2params{ { "U8", "FP32"}, 5, 1 },
- precisions_test_2params{ {"FP32", "U8"}, 5, 1 },
- precisions_test_2params{ { "U8", "U8"}, 6, 2 }
+ precisions_test_2params{ { "U8", "FP32"}, 4, 0 },
+ precisions_test_2params{ {"FP32", "U8"}, 4, 0 },
+ precisions_test_2params{ { "U8", "U8"}, 4, 0 }
));
graph.CreateGraph(network);
auto& nodes = graph.getNodes();
for (int i = 0; i < nodes.size(); i++) {
- if (nodes[i]->getType() == MKLDNNPlugin::Power) {
+ if (nodes[i]->getType() == MKLDNNPlugin::Eltwise) {
ASSERT_EQ(p.num_prim_desc, nodes[i]->getSupportedPrimitiveDescriptors().size());
for (size_t j = 0; j < p.num_prim_desc && j < p.comp.size(); j++) {
p.comp.at(j)(nodes[i]->getSupportedPrimitiveDescriptors().at(j));
}
ASSERT_NE(nullptr, nodes[i]->getSelectedPrimitiveDescriptor());
- ASSERT_EQ(p.selectedType, nodes[i]->getSelectedPrimitiveDescriptor()->getImplementationType());
}
}
power_test_params{
{1, 3, 13, 13}, 1, 2, 0.5f, 3, MKLDNNPlugin::impl_desc_type::unknown, {
[](MKLDNNPlugin::PrimitiveDescInfo impl) {
- ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
ASSERT_EQ(1, impl.getConfig().inConfs.size());
ASSERT_EQ(1, impl.getConfig().outConfs.size());
- ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
- ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
},
[](MKLDNNPlugin::PrimitiveDescInfo impl) {
- ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
ASSERT_EQ(1, impl.getConfig().inConfs.size());
ASSERT_EQ(1, impl.getConfig().outConfs.size());
- ASSERT_EQ(InferenceEngine::Layout::BLOCKED, impl.getConfig().inConfs.at(0).desc.getLayout());
- ASSERT_EQ(InferenceEngine::Layout::BLOCKED, impl.getConfig().outConfs.at(0).desc.getLayout());
},
[](MKLDNNPlugin::PrimitiveDescInfo impl) {
- ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
ASSERT_EQ(1, impl.getConfig().inConfs.size());
ASSERT_EQ(1, impl.getConfig().outConfs.size());
- ASSERT_EQ(InferenceEngine::Layout::BLOCKED, impl.getConfig().inConfs.at(0).desc.getLayout());
- ASSERT_EQ(InferenceEngine::Layout::BLOCKED, impl.getConfig().outConfs.at(0).desc.getLayout());
}}},
power_test_params{{1, 1, 23, 23}, 3, 8, 2, 3 },
power_test_params{{1, 8, 23, 23}, 8, 2, 1, 3 },
outputBlobs[item.first] = output;
auto checkPower = [](const MKLDNNPlugin::MKLDNNNodePtr& node) {
- return node->getType() == MKLDNNPlugin::Power;
+ return node->getType() == MKLDNNPlugin::Eltwise;
};
graph.checkDynBatch(srcs, outputBlobs, MB, MB, checkPower);
graph.checkDynBatch(srcs, outputBlobs, 1, MB, checkPower);
power_test_params{
{1, 3, 13, 13}, 1, 2, 0.5f, 3, MKLDNNPlugin::impl_desc_type::unknown, {
[](MKLDNNPlugin::PrimitiveDescInfo impl) {
- ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
ASSERT_EQ(1, impl.getConfig().inConfs.size());
ASSERT_EQ(1, impl.getConfig().outConfs.size());
- ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
- ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
},
[](MKLDNNPlugin::PrimitiveDescInfo impl) {
- ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
ASSERT_EQ(1, impl.getConfig().inConfs.size());
ASSERT_EQ(1, impl.getConfig().outConfs.size());
- ASSERT_EQ(InferenceEngine::Layout::BLOCKED, impl.getConfig().inConfs.at(0).desc.getLayout());
- ASSERT_EQ(InferenceEngine::Layout::BLOCKED, impl.getConfig().outConfs.at(0).desc.getLayout());
},
[](MKLDNNPlugin::PrimitiveDescInfo impl) {
- ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
ASSERT_EQ(1, impl.getConfig().inConfs.size());
ASSERT_EQ(1, impl.getConfig().outConfs.size());
- ASSERT_EQ(InferenceEngine::Layout::BLOCKED, impl.getConfig().inConfs.at(0).desc.getLayout());
- ASSERT_EQ(InferenceEngine::Layout::BLOCKED, impl.getConfig().outConfs.at(0).desc.getLayout());
}}},
power_test_params{{1, 1, 23, 23}, 3, 8, 2, 3 },
power_test_params{{1, 8, 23, 23}, 8, 2, 1, 3 },
ASSERT_EQ(nodes.size(), 3);
ASSERT_EQ(nodes[0].get()->getType(), MKLDNNPlugin::Type::Input);
ASSERT_EQ(nodes[1].get()->getType(), MKLDNNPlugin::Type::Convolution);
- ASSERT_TRUE(nodes[1].get()->isFusedWith(MKLDNNPlugin::Type::Depthwise));
+ ASSERT_TRUE(nodes[1].get()->isFusedWith(MKLDNNPlugin::Type::Eltwise));
ASSERT_EQ(nodes[2].get()->getType(), MKLDNNPlugin::Type::Output);
} else {
ASSERT_EQ(nodes.size(), 5);
ASSERT_EQ(nodes[0].get()->getType(), MKLDNNPlugin::Type::Input);
ASSERT_EQ(nodes[1].get()->getType(), MKLDNNPlugin::Type::Reorder);
ASSERT_EQ(nodes[2].get()->getType(), MKLDNNPlugin::Type::Convolution);
- ASSERT_TRUE(nodes[2].get()->isFusedWith(MKLDNNPlugin::Type::Depthwise));
+ ASSERT_TRUE(nodes[2].get()->isFusedWith(MKLDNNPlugin::Type::Eltwise));
ASSERT_EQ(nodes[3].get()->getType(), MKLDNNPlugin::Type::Reorder);
ASSERT_EQ(nodes[4].get()->getType(), MKLDNNPlugin::Type::Output);
}
for (auto &node : nodes) {
if (node->getType() == MKLDNNPlugin::Reorder) {
reorders_num++;
- ASSERT_EQ(MKLDNNPlugin::Output, node->getChildEdgeAt(0)->getChild()->getType());
}
}
- ASSERT_EQ(reorders_num, 1);
+ ASSERT_EQ(reorders_num, 3);
}
TEST_F(MKLDNNGraphStructureTests, TestRedundantReorderBeforeConvWithC_3) {
weights->allocate();
fill_data((float *) weights->buffer(), weights->size() / sizeof(float));
InferenceEngine::TBlob<uint8_t>::Ptr weights_ptr = InferenceEngine::TBlob<uint8_t>::Ptr(weights);
-
+
InferenceEngine::Core core;
InferenceEngine::CNNNetwork network;
ASSERT_NO_THROW(network = core.ReadNetwork(model, weights_ptr));
fill_data((float *) weights->buffer(), weights->size() / sizeof(float));
InferenceEngine::TBlob<uint8_t>::Ptr weights_ptr = InferenceEngine::TBlob<uint8_t>::Ptr(weights);
-
+
InferenceEngine::Core core;
InferenceEngine::CNNNetwork network;
ASSERT_NO_THROW(network = core.ReadNetwork(model, weights_ptr));
memset((float *) weights->buffer(), 0, weights->size());
InferenceEngine::TBlob<uint8_t>::Ptr weights_ptr = InferenceEngine::TBlob<uint8_t>::Ptr(weights);
-
+
InferenceEngine::Core core;
InferenceEngine::CNNNetwork network;
network = core.ReadNetwork(model, weights_ptr);
weights->allocate();
fill_data((float *) weights->buffer(), weights->size() / sizeof(float));
InferenceEngine::TBlob<uint8_t>::Ptr weights_ptr = InferenceEngine::TBlob<uint8_t>::Ptr(weights);
-
+
InferenceEngine::Core core;
InferenceEngine::CNNNetwork network;
ASSERT_NO_THROW(network = core.ReadNetwork(model, weights_ptr));
weights->allocate();
fill_data((float *) weights->buffer(), weights->size() / sizeof(float));
InferenceEngine::TBlob<uint8_t>::Ptr weights_ptr = InferenceEngine::TBlob<uint8_t>::Ptr(weights);
-
+
InferenceEngine::Core core;
InferenceEngine::CNNNetwork network;
ASSERT_NO_THROW(network = core.ReadNetwork(model, weights_ptr));
ASSERT_EQ(nodes[0].get()->getType(), MKLDNNPlugin::Type::Input);
ASSERT_EQ(nodes[1].get()->getType(), MKLDNNPlugin::Type::Reorder);
ASSERT_EQ(nodes[2].get()->getType(), MKLDNNPlugin::Type::Convolution);
- ASSERT_TRUE(nodes[2].get()->isFusedWith(MKLDNNPlugin::Type::Depthwise));
+ ASSERT_TRUE(nodes[2].get()->isFusedWith(MKLDNNPlugin::Type::Eltwise));
ASSERT_EQ(nodes[3].get()->getType(), MKLDNNPlugin::Type::Reorder);
ASSERT_EQ(nodes[4].get()->getType(), MKLDNNPlugin::Type::Output);
ASSERT_EQ(nodes.size(), 4);
ASSERT_EQ(nodes[0].get()->getType(), MKLDNNPlugin::Type::Input);
ASSERT_EQ(nodes[1].get()->getType(), MKLDNNPlugin::Type::Convolution);
- ASSERT_TRUE(nodes[1].get()->isFusedWith(MKLDNNPlugin::Type::Activation));
+ ASSERT_TRUE(nodes[1].get()->isFusedWith(MKLDNNPlugin::Type::Eltwise));
ASSERT_EQ(nodes[2].get()->getType(), MKLDNNPlugin::Type::Reorder);
ASSERT_EQ(nodes[3].get()->getType(), MKLDNNPlugin::Type::Output);
ASSERT_EQ(nodes.size(), 3);
ASSERT_EQ(nodes[0].get()->getType(), MKLDNNPlugin::Type::Input);
ASSERT_EQ(nodes[1].get()->getType(), MKLDNNPlugin::Type::Convolution);
- ASSERT_TRUE(nodes[1].get()->isFusedWith(MKLDNNPlugin::Type::Depthwise));
+ ASSERT_TRUE(nodes[1].get()->isFusedWith(MKLDNNPlugin::Type::Eltwise));
ASSERT_EQ(nodes[2].get()->getType(), MKLDNNPlugin::Type::Output);
InferenceEngine::TensorDesc src_desc(InferenceEngine::Precision::FP32, {1, 8, 300, 600}, InferenceEngine::NCHW);
#include <nodes/mkldnn_input_node.h>
#include <functional>
#include <cmath>
+#include <legacy/details/ie_cnn_network_tools.h>
#define GARB_VAL(x) ((x + 100.0f + sin(x)) / (x + 150.f))
return graphNodes;
}
+ void MoveInternalBlobsToConstLayers(InferenceEngine::details::CNNNetworkImpl* netImpl) {  // For single-input ScaleShift/PReLU layers of netImpl, exposes their "weights"/"biases" blobs as additional inputs fed by new "Const" layers.
+ auto createConstInputTo = [&](InferenceEngine::CNNLayerPtr layer, InferenceEngine::Blob::Ptr blob, std::string name) {  // Creates a "Const" layer carrying `blob` and appends its output to `layer`'s inputs.
+ InferenceEngine::LayerParams attrs = {layer.get()->name + "_const_" + name, "Const", InferenceEngine::Precision::FP32};  // new layer is named "<layer name>_const_<name>"
+ auto constLayer = std::make_shared<InferenceEngine::CNNLayer>(attrs);
+ constLayer->blobs["custom"] = blob;  // the payload is stored under the "custom" key
+
+ std::vector<size_t> constDims(layer->insData[0].lock()->getDims().size(), 1);  // all-ones shape with the rank of the layer's first input; NOTE(review): lock() result is not null-checked
+ if (constDims.size() > 1)
+ constDims[1] = blob.get()->size();  // rank > 1: blob size goes on axis 1
+ else
+ constDims[0] = blob.get()->size();  // otherwise: blob size goes on axis 0
+ const InferenceEngine::TensorDesc& td = {InferenceEngine::Precision::FP32, constDims, InferenceEngine::TensorDesc::getLayoutByDims(constDims)};  // FP32 descriptor, layout derived from the dims
+
+ InferenceEngine::DataPtr newEdgeAfterLayer(new InferenceEngine::Data(constLayer->name, td));  // data object that acts as the Const layer's output edge
+ newEdgeAfterLayer->setName(constLayer->name);
+ getCreatorLayer(newEdgeAfterLayer) = constLayer;  // mark the Const layer as the producer of this data
+ getInputTo(newEdgeAfterLayer).clear();  // start with no consumers
+
+
+ netImpl->addData(constLayer->name.c_str(), newEdgeAfterLayer);  // register the new data in the network under the Const layer's name
+ IE_SUPPRESS_DEPRECATED_START
+ netImpl->addLayer(constLayer);  // wrapped in the deprecation-suppression macros above/below
+ IE_SUPPRESS_DEPRECATED_END
+
+ constLayer->outData.push_back(newEdgeAfterLayer);  // Const layer -> data
+ getInputTo(newEdgeAfterLayer)[layer->name] = layer;  // data -> target layer
+ layer->insData.push_back(newEdgeAfterLayer);  // appended after the existing inputs, so call order fixes input order
+ };
+
+ auto all_layers = InferenceEngine::details::CNNNetSortTopologically(*netImpl);  // layer list is taken once, before any Const layer is added
+ for (auto &layer : all_layers) {
+ if (layer->type == "ScaleShift" && layer->insData.size() == 1) {  // only layers that currently have exactly one input
+ InferenceEngine::Blob::Ptr scalesBlob = layer->blobs["weights"];  // NOTE(review): if blobs is a std::map, operator[] inserts a null entry when the key is absent - confirm that is harmless
+ if (scalesBlob != nullptr)
+ createConstInputTo(layer, scalesBlob, "weights");  // weights are attached first
+
+ InferenceEngine::Blob::Ptr shiftBlob = layer->blobs["biases"];
+ if (shiftBlob != nullptr)
+ createConstInputTo(layer, shiftBlob, "biases");  // biases are attached after weights (when both exist)
+ } else if (layer->type == "PReLU" && layer->insData.size() == 1) {  // PReLU: only the "weights" blob is moved
+ InferenceEngine::Blob::Ptr scalesBlob = layer->blobs["weights"];
+ if (scalesBlob != nullptr)
+ createConstInputTo(layer, scalesBlob, "weights");
+ }
+ }
+ }
+
void CreateGraph(InferenceEngine::ICNNNetwork &network, const MKLDNNPlugin::MKLDNNExtensionManager::Ptr& extMgr,
MKLDNNPlugin::MKLDNNWeightsSharing::Ptr cache = {}) {
if (network.getFunction()) {
auto convertedNetwork = std::make_shared<InferenceEngine::details::CNNNetworkImpl>(network);
+ MoveInternalBlobsToConstLayers(convertedNetwork.get());  // rewrite the converted copy before the graph is built from it
MKLDNNGraph::CreateGraph(static_cast<InferenceEngine::ICNNNetwork&>(*convertedNetwork),
- extMgr, cache);
+ extMgr, cache);
} else {
+ InferenceEngine::details::CNNNetworkImpl* netImpl = dynamic_cast<InferenceEngine::details::CNNNetworkImpl*>(&network);  // this branch needs the concrete CNNNetworkImpl to edit layers in place
+ if (netImpl == nullptr) {
+ THROW_IE_EXCEPTION << "unexpected network type";  // thrown when the dynamic_cast above fails
+ }
+ MoveInternalBlobsToConstLayers(netImpl);  // modifies the caller's network object itself, not a copy
MKLDNNGraph::CreateGraph(network, extMgr, cache);
}
}
MKLDNNPlugin::MKLDNNWeightsSharing::Ptr cache;
if (network.getFunction()) {
auto convertedNetwork = std::make_shared<InferenceEngine::details::CNNNetworkImpl>(network);
+ MoveInternalBlobsToConstLayers(convertedNetwork.get());
MKLDNNGraph::CreateGraph(static_cast<InferenceEngine::ICNNNetwork&>(*convertedNetwork),
extensionManager, cache);
} else {
+ InferenceEngine::details::CNNNetworkImpl* netImpl = dynamic_cast<InferenceEngine::details::CNNNetworkImpl*>(&network);
+ if (netImpl == nullptr) {
+ THROW_IE_EXCEPTION << "unexpected network type";
+ }
+ MoveInternalBlobsToConstLayers(netImpl);
MKLDNNGraph::CreateGraph(network, extensionManager, cache);
}
}
-Subproject commit 4b239023043318899e1c0a3b79158a68b7efe6e4
+Subproject commit d7d8ed46078b637794bc91215e1a982bb0f1683a
xfail_issue_38085 = xfail_test(reason="RuntimeError: Interpolate operation should be converted to Interp")
xfail_issue_38086 = xfail_test(reason="RuntimeError: Quantize layer input '<value>' doesn't have blobs")
xfail_issue_38087 = xfail_test(reason="RuntimeError: Cannot cast to tensor desc. Format is unsupported!")
-xfail_issue_38088 = xfail_test(reason="RuntimeError: Check '((axis >= axis_range_min) && "
- "(axis <= axis_range_max))' failed at "
- "/openvino/ngraph/core/src/validation_util.cpp:913: "
- "Split Parameter axis <value> out of the tensor rank range <value>.")
-xfail_issue_38089 = xfail_test(reason="RuntimeError: Node 2 contains empty child edge for index 0")
xfail_issue_38090 = xfail_test(reason="AssertionError: Items types are not equal")
xfail_issue_38091 = xfail_test(reason="AssertionError: Mismatched elements")
xfail_issue_38699 = xfail_test(reason="RuntimeError: nGraph does not support the following ONNX operations:"
skip_segfault,
xfail_issue_34327,
xfail_issue_36485,
- xfail_issue_35923,
xfail_issue_36486,
xfail_issue_34314,
xfail_issue_36487)
assert np.allclose(result, expected)
-@xfail_issue_35923
def test_prelu_operator():
runtime = get_runtime()
xfail_issue_33616,
xfail_issue_38086,
xfail_issue_38087,
- xfail_issue_35923,
xfail_issue_36483,
xfail_issue_34323,
xfail_issue_35915,
xfail_issue_36476,
xfail_issue_36478,
xfail_issue_36437,
- xfail_issue_38088,
- xfail_issue_38089,
xfail_issue_38090,
xfail_issue_38091,
xfail_issue_35929,
"OnnxBackendNodeModelTest.test_quantizelinear_cpu"),
(xfail_issue_38087,
"OnnxBackendNodeModelTest.test_convtranspose_1d_cpu"),
- (xfail_issue_35923,
- "OnnxBackendNodeModelTest.test_prelu_broadcast_cpu",
- "OnnxBackendNodeModelTest.test_prelu_example_cpu"),
(xfail_issue_36483,
"OnnxBackendNodeModelTest.test_ceil_cpu",
"OnnxBackendNodeModelTest.test_ceil_example_cpu"),
"OnnxBackendNodeModelTest.test_argmin_keepdims_example_select_last_index_cpu",
"OnnxBackendNodeModelTest.test_argmin_keepdims_random_select_last_index_cpu",
"OnnxBackendNodeModelTest.test_pow_types_float32_uint32_cpu"),
- (xfail_issue_38088,
- "OnnxBackendPyTorchConvertedModelTest.test_GLU_cpu"),
- (xfail_issue_38089,
- "OnnxBackendPyTorchConvertedModelTest.test_GLU_dim_cpu"),
(xfail_issue_38090,
"OnnxBackendNodeModelTest.test_where_long_example_cpu",
"OnnxBackendNodeModelTest.test_mod_int64_fmod_cpu",
import pytest
from tests.test_onnx.utils import run_node
-from tests import xfail_issue_35915
@pytest.mark.parametrize(
pytest.param("And", np.logical_and, np.bool),
pytest.param("Or", np.logical_or, np.bool),
pytest.param("Xor", np.logical_xor, np.bool),
- pytest.param("Equal", np.equal, np.int32, marks=xfail_issue_35915),
- pytest.param("Greater", np.greater, np.int32, marks=xfail_issue_35915),
- pytest.param("Less", np.less, np.int32, marks=xfail_issue_35915),
+ pytest.param("Equal", np.equal, np.int32),
+ pytest.param("Greater", np.greater, np.int32),
+ pytest.param("Less", np.less, np.int32),
],
)
def test_logical(onnx_op, numpy_func, data_type):
import pytest
from tests.test_onnx.utils import run_node
-from tests import xfail_issue_35918, xfail_issue_35923, xfail_issue_35924
+from tests import xfail_issue_35918, xfail_issue_35924
def import_and_compute(op_type, input_data, **node_attrs):
assert_onnx_import_equals_callable("LeakyRelu", leaky_relu, [[-3, -2, -1], [1, 2, 3]])
-@xfail_issue_35923
@pytest.mark.parametrize(
"x, slope",
[