1 // Copyright (C) 2018-2019 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
5 #include "mkldnn_concat_node.h"
10 #include <mkldnn_extension_utils.h>
12 #include "details/ie_exception.hpp"
13 #include "ie_layers.h"
15 #include "mkldnn/iml_type_mapper.h"
16 #include "mkldnn_dims.h"
17 #include "mkldnn_edge.h"
18 #include "mkldnn_memory.h"
19 #include "ie_parallel.hpp"
22 using namespace mkldnn;
23 using namespace MKLDNNPlugin;
24 using namespace InferenceEngine;
// Thin constructor: forwards the CNN layer and the mkldnn engine to the
// MKLDNNNode base class; no concat-specific state is initialized here
// (the axis is read later in getSupportedDescriptors()).
26 MKLDNNConcatNode::MKLDNNConcatNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng) : MKLDNNNode(layer, eng) {}
// Reads the concatenation axis from the IE ConcatLayer and validates the node's
// connectivity: the layer must be a real ConcatLayer, there must be at least one
// input and one output edge, and all inputs must have matching, non-empty dims.
28 void MKLDNNConcatNode::getSupportedDescriptors() {
// The wrapped CNN layer must actually be a ConcatLayer.
29 auto * conLayer = dynamic_cast<ConcatLayer*>(getCnnLayer().get());
31 if (conLayer == nullptr)
32 THROW_IE_EXCEPTION << "Cannot convert concat layer.";
// Axis along which the inputs are concatenated.
34 axis = conLayer->_axis;
36 if (getParentEdges().empty())
37 THROW_IE_EXCEPTION << "Incorrect number of input edges for layer " << getName();
38 if (getChildEdges().empty())
39 THROW_IE_EXCEPTION << "Incorrect number of output edges for layer " << getName();
// Every input must match the first input's rank and per-dimension sizes.
// NOTE(review): the visible comparison includes every dimension j; presumably the
// concat axis itself is skipped in lines omitted from this view — confirm against
// the full source (the axis dimension is the one allowed to differ for concat).
40 auto& firstParentDims = getParentEdgeAt(0)->getDims();
41 for (size_t i = 1; i < getParentEdges().size(); i++) {
42 auto& dims = getParentEdgeAt(i)->getDims();
43 bool incorrectDims = false;
44 for (size_t j = 0; j < firstParentDims.ndims(); j++) {
47 if (dims.ndims() != firstParentDims.ndims() || firstParentDims[j] != dims[j]) {
// A rank-0 first input is also rejected as invalid.
52 if (incorrectDims || firstParentDims.ndims() == 0) {
53 THROW_IE_EXCEPTION << "Incorrect input dimensions for concat node " << getName();
// Populates supportedPrimitiveDescriptors with every configuration this concat
// can run in:
//   1) reference (copy-based) configs: plain output format, plus channel-blocked
//      formats (nChw8c/nChw16c for 4D, nCdhw8c/nCdhw16c for 5D) when the channel
//      count divides evenly by the block size;
//   2) optimized in-place configs (impl_desc_type::unknown) — only when the
//      concat axis is 1 (channels) and no input comes from an Eltwise node —
//      including a dedicated NHWC config for I8 precision.
58 void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
// Already initialized — nothing to do (the early return is in a line omitted
// from this view).
59 if (!supportedPrimitiveDescriptors.empty())
// Derive mkldnn data types from the IE precisions of the first input / output.
62 InferenceEngine::Precision iIEPrecision = getCnnLayer()->insData[0].lock()->getPrecision();
63 auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(iIEPrecision);
64 InferenceEngine::Precision precision = getCnnLayer()->outData[0]->getPrecision();
65 auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
67 MKLDNNDims dstDims = getChildEdgeAt(0)->getDims();
68 InferenceEngine::LayerConfig config;
69 config.dynBatchSupport = true;
70 bool hasEltwise = false;
// One input config per parent edge. format::any lets the reference config accept
// whatever layout the producer emits. Also detect Eltwise parents — presumably
// hasEltwise is set in the omitted line following the check; that flag disables
// the in-place path below.
72 for (size_t i = 0; i < getParentEdges().size(); i++) {
73 auto parentEdge = getParentEdgeAt(i);
74 if (parentEdge->getParent()->getType() == Eltwise)
77 InferenceEngine::DataConfig dataConfig;
78 dataConfig.inPlace = -1;
79 dataConfig.constant = false;
80 dataConfig.desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(parentEdge->getDims(), inputDataType, memory::format::any));
81 config.inConfs.push_back(dataConfig);
// Reference (pure-copy) descriptor with a plain output format.
84 auto dims = getChildEdgeAt(0)->getDims();
86 config.outConfs.resize(1);
87 config.outConfs[0].inPlace = -1;
88 config.outConfs[0].constant = false;
89 config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(dims, outputDataType, MKLDNNMemory::GetPlainFormat(dims)));
90 supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref);
// Extra reference descriptors in channel-blocked formats, gated on the channel
// count (dims[1]) being a multiple of the block size.
91 if (dims.ndims() == 4) {
92 if (dims[1] % 8 == 0) {
93 config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::nChw8c));
94 supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref);
96 if (dims[1] % 16 == 0) {
97 config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::nChw16c));
98 supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref);
101 } else if (dims.ndims() == 5) {
102 if (dims[1] % 8 == 0) {
103 config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::nCdhw8c));
104 supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref);
106 if (dims[1] % 16 == 0) {
107 config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::nCdhw16c));
108 supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref);
// In-place descriptors are produced only for channel-axis concat without Eltwise
// producers (the early return is in an omitted line).
113 if (axis != 1 || hasEltwise)
116 auto numOfDim = static_cast<size_t>(dstDims.ndims());
// Identity permutation and zero per-dim offsets; offset == max() marks the
// tensor desc as "uninitialized" so the real offset is resolved later in
// initOptimalPrimitiveDescriptor().
118 SizeVector order(numOfDim);
119 SizeVector offsets(numOfDim, 0lu);
120 size_t offset = std::numeric_limits<size_t>::max();
121 for (size_t i = 0; i < numOfDim; i++) {
// Special case for I8 precision: publish an NHWC-ordered config.
125 if (this->getCnnLayer()->precision == Precision::I8) {
127 // Here we assume NHWC layout (channels are the last)
129 order = {0, 2, 3, 1};
130 offsets = {0, 0, 0, 0};
// Block dims permuted to NHWC order (N, H, W, C).
132 SizeVector blkDims = dstDims.ToSizeVector();
133 blkDims = { blkDims[0], blkDims[2], blkDims[3], blkDims[1] };
135 SizeVector strides(numOfDim);
136 strides.resize(numOfDim);
137 // C is the last in NHWC, so all strides are max()
138 for (size_t i = 0; i < numOfDim; i++) {
139 strides[i] = std::numeric_limits<size_t>::max();
142 config.outConfs[0].desc = TensorDesc(this->getCnnLayer()->outData[0]->getPrecision(),
143 dstDims.ToSizeVector(),
144 { blkDims, order, offset, offsets, strides });
145 for (size_t i = 0; i < getParentEdges().size(); i++) {
146 auto parentEdge = getParentEdgeAt(i);
148 SizeVector blkDims = parentEdge->getDims().ToSizeVector();
149 blkDims = { blkDims[0], blkDims[2], blkDims[3], blkDims[1] };
151 config.inConfs[i].inPlace = -1; // Change to 0 here if inplace concat is supported for NHWC in mkldnn
153 config.inConfs[i].desc = TensorDesc(iIEPrecision, parentEdge->getDims().ToSizeVector(),
154 {blkDims, order, offset, offsets, strides});
157 supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref);
// Generic in-place config in plain layout. Strides at and after the concat axis
// are set to max() (unknown until the final output descriptor is chosen);
// strides before the axis are computed from the output dims.
160 SizeVector strides(numOfDim);
161 strides[numOfDim - 1] = 1;
162 for (size_t i = 2; i <= numOfDim; i++) {
163 if (numOfDim - i < axis) {
164 strides[numOfDim - i] = std::numeric_limits<size_t>::max();
166 strides[numOfDim - i] = strides[numOfDim - i + 1] * dstDims[numOfDim - i + 1];
170 config.outConfs[0].desc = TensorDesc(
171 MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType),
172 dstDims.ToSizeVector(),
173 {dstDims.ToSizeVector(), order, offset, offsets, strides});
// Inputs write directly into the shared output buffer (inPlace = 0).
174 for (size_t i = 0; i < getParentEdges().size(); i++) {
175 auto parentEdge = getParentEdgeAt(i);
176 config.inConfs[i].inPlace = 0;
177 config.inConfs[i].desc = TensorDesc(MKLDNNExtensionUtils::DataTypeToIEPrecision(inputDataType), parentEdge->getDims().ToSizeVector(),
178 {parentEdge->getDims().ToSizeVector(), order, offset, offsets, strides});
181 supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown);
// Blocked in-place configs: the blocked rank is numOfDim + 1 (extra innermost
// channel-block dimension), and every input's channel count must divide evenly
// by the block size for in-place to be valid (canInplace is presumably cleared
// in the omitted lines after the per-input modulus check).
183 if (numOfDim == 4lu || numOfDim == 5lu) {
184 size_t blkDimsLen = numOfDim + 1;
185 order.resize(blkDimsLen);
186 for (size_t i = 0; i < numOfDim; i++) {
189 order[numOfDim] = 1lu;
190 offsets = SizeVector(blkDimsLen, 0lu);
192 // nChw8c, nChw16c, nCdhw8c, nCdhw16c
193 for (size_t sizeS : {8lu, 16lu}) {
194 SizeVector blkDims = dstDims.ToSizeVector();
195 if (blkDims[1] % sizeS)
197 blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1lu : 0lu);
198 blkDims.push_back(sizeS);
// Same stride scheme as the plain config, over the blocked dims.
200 strides.resize(blkDimsLen);
201 strides[blkDimsLen - 1] = 1;
202 for (size_t i = 2lu; i <= blkDimsLen; i++) {
203 if (blkDimsLen - i < axis) {
204 strides[blkDimsLen - i] = std::numeric_limits<size_t>::max();
206 strides[blkDimsLen - i] = strides[blkDimsLen - i + 1] * blkDims[blkDimsLen - i + 1];
209 config.outConfs[0].desc = TensorDesc(
210 MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType),
211 dstDims.ToSizeVector(), {blkDims, order, offset, offsets, strides});
213 bool canInplace = true;
214 for (size_t i = 0lu; canInplace && i < getParentEdges().size(); i++) {
215 auto parentEdge = getParentEdgeAt(i);
216 blkDims = parentEdge->getDims().ToSizeVector();
217 if (blkDims[1] % sizeS)
220 blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1lu : 0lu);
221 blkDims.push_back(sizeS);
222 config.inConfs[i].desc = TensorDesc(MKLDNNExtensionUtils::DataTypeToIEPrecision(inputDataType), parentEdge->getDims().ToSizeVector(),
223 {blkDims, order, offset, offsets, strides});
226 supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown);
// Chooses the primitive descriptor to run with:
//   - collects in-place candidates (impl_desc_type::unknown with fully specified
//     layouts) into canSelectPrimitive;
//   - if any parent edge feeds this node twice ("double connection"), in-place
//     is impossible, so the pure-copy descriptor at index 0 is selected;
//   - otherwise votes on the memory format most frequently used by already
//     selected neighbors and picks the matching in-place or reference config,
//     falling back to index 0.
232 void MKLDNNConcatNode::selectOptimalPrimitiveDescriptor() {
233 InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision();
234 auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
235 precision = getCnnLayer()->outData[0]->getPrecision();
236 auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
// Gather in-place candidates: must be impl_desc_type::unknown, have inPlace >= 0
// on the first input, and have no ANY layouts left in inputs or outputs.
// (hasUnknown is presumably set in lines omitted from this view.)
238 bool hasUnknown = false;
239 std::vector<size_t> canSelectPrimitive;
240 for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); i++) {
242 auto &primDescInfo = supportedPrimitiveDescriptors[i];
243 if (primDescInfo.getImplementationType() != impl_desc_type::unknown ||
244 primDescInfo.getConfig().inConfs[0].inPlace < 0)
247 for (auto iInfo : primDescInfo.getConfig().inConfs) {
248 if (iInfo.desc.getLayout() != InferenceEngine::Layout::ANY) {
255 for (auto oInfo : primDescInfo.getConfig().outConfs) {
256 if (oInfo.desc.getLayout() != InferenceEngine::Layout::ANY) {
264 canSelectPrimitive.push_back(i);
// Detect the same parent edge connected to this concat more than once.
268 bool hasDoubleConnection = false;
269 for (int i = 0; i < getParentEdges().size(); i++) {
270 for (int j = i + 1; j < getParentEdges().size(); j++) {
271 if (getParentEdgeAt(i) == getParentEdgeAt(j)) hasDoubleConnection = true;
275 if (hasDoubleConnection) {
276 // The double connection marks that some tensor should
277 // be replicated. Inplace approach is not applicable
278 // for that case. Descriptor with index 0 is pure copy
280 selectPrimitiveDescriptorByIndex(0);
// In-place is also disallowed when a producer already feeds another concat that
// was optimized (canOptimize is presumably cleared in omitted lines after the
// childConcat->isOptimized() check — confirm against the full source).
284 bool canOptimize = true;
285 for (size_t i = 0; canOptimize && i < getParentEdges().size(); i++) {
286 const auto& parent = getParentEdgeAt(i)->getParent();
287 for (size_t j = 0; canOptimize && j < parent->getChildEdges().size(); j++) {
288 const auto& child = parent->getChildEdgeAt(j)->getChild();
289 const auto* childConcat = dynamic_cast<MKLDNNConcatNode *>(child.get());
290 if (!childConcat || childConcat == this)
292 if (childConcat->isOptimized())
// Shortcut: exactly one in-place candidate for channel concat — take it.
296 if (hasUnknown && axis == 1) {
297 if (canSelectPrimitive.size() == 1) {
298 selectPrimitiveDescriptorByIndex(static_cast<int>(canSelectPrimitive[0]));
// Vote: count the output format of every parent that already selected a
// descriptor...
305 std::map<mkldnn::memory::format, size_t> formatFrequency;
306 for (size_t i = 0; i < getParentEdges().size(); i++) {
307 auto parentEdge = getParentEdgeAt(i);
308 auto parent = parentEdge->getParent();
310 if (parent->getSelectedPrimitiveDescriptor() == nullptr)
313 int outputIndex = parentEdge->getOutputNum();
315 THROW_IE_EXCEPTION << "Cannot find index of output node";
316 if (outputIndex >= parent->getSelectedPrimitiveDescriptor()->getConfig().outConfs.size())
318 auto outDesc = MKLDNNMemoryDesc(parent->getSelectedPrimitiveDescriptor()->getConfig().outConfs[outputIndex].desc);
321 if (formatFrequency.find(outDesc.getFormat()) != formatFrequency.end())
322 formatFrequency[outDesc.getFormat()] += 1;
324 formatFrequency[outDesc.getFormat()] = 1;
// ... and the input format of every child that already selected a descriptor.
326 for (size_t i = 0; i < getChildEdges().size(); i++) {
327 auto childEdge = getChildEdgeAt(i);
328 auto child = childEdge->getChild();
329 if (child->getSelectedPrimitiveDescriptor() == nullptr)
331 int inputIndex = childEdge->getOutputNum();
333 THROW_IE_EXCEPTION << "Cannot find index of output node";
334 if (inputIndex >= child->getSelectedPrimitiveDescriptor()->getConfig().inConfs.size())
336 auto outDesc = MKLDNNMemoryDesc(child->getSelectedPrimitiveDescriptor()->getConfig().inConfs[inputIndex].desc);
339 if (formatFrequency.find(outDesc.getFormat()) != formatFrequency.end())
340 formatFrequency[outDesc.getFormat()] += 1;
342 formatFrequency[outDesc.getFormat()] = 1;
// Pick the most frequent neighbor format; default to the plain format.
346 mkldnn::memory::format convertTo = MKLDNNMemory::GetPlainFormat(getChildEdgeAt(0)->getDims());
347 for (auto &it : formatFrequency) {
348 if (it.second > maxCount) {
349 maxCount = it.second;
350 convertTo = it.first;
// Fall back to plain format if the voted format would require padded (extended)
// blocks on the output or any input — in-place concat cannot handle padding.
354 if (canOptimize && MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, convertTo).blocksExtended())
355 convertTo = MKLDNNMemory::GetPlainFormat(getChildEdgeAt(0)->getDims());
356 for (size_t i = 0; canOptimize && i < getParentEdges().size(); i++) {
357 if (MKLDNNMemoryDesc(getParentEdgeAt(i)->getDims(), inputDataType, convertTo).blocksExtended())
358 convertTo = MKLDNNMemory::GetPlainFormat(getChildEdgeAt(0)->getDims());
// Prefer an in-place candidate whose input format matches the vote.
361 for (auto supportedPdIndex : canSelectPrimitive) {
362 if (MKLDNNMemoryDesc(supportedPrimitiveDescriptors[supportedPdIndex].getConfig().inConfs[0].desc).getFormat() == convertTo) {
363 selectPrimitiveDescriptorByIndex(static_cast<int>(supportedPdIndex));
// Otherwise pick a reference descriptor with the voted output format, provided
// no input would need extended blocks in that format.
368 for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); i++) {
369 auto &primDescInfo = supportedPrimitiveDescriptors[i];
370 if (primDescInfo.getImplementationType() == impl_desc_type::unknown)
372 if (convertTo == MKLDNNMemoryDesc(supportedPrimitiveDescriptors[i].getConfig().outConfs[0].desc).getFormat()) {
374 for (num = 0; num < getParentEdges().size(); num++) {
375 if (MKLDNNMemoryDesc(getParentEdgeAt(num)->getDims(), inputDataType, convertTo).blocksExtended())
378 if (num == getParentEdges().size()) {
379 selectPrimitiveDescriptorByIndex(i);
// Last resort: the plain-format pure-copy descriptor.
384 selectPrimitiveDescriptorByIndex(0);
// The node is a valid concat iff its resolved type is Concatenation.
387 bool MKLDNNConcatNode::created() const {
388 return getType() == Concatenation;
// "Optimized" means the selected descriptor uses in-place inputs (inPlace >= 0
// on the first input config): producers write straight into the concat output
// buffer, so no copy primitive is created or executed.
391 bool MKLDNNConcatNode::isOptimized() const {
392 return getSelectedPrimitiveDescriptor() && getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].inPlace >= 0;
// Creates the mkldnn concat primitive for the non-optimized (copy) path.
// Skipped entirely when the primitive already exists or the node runs in-place.
// Throws if any source/destination memory is unallocated or no primitive
// descriptor was selected.
395 void MKLDNNConcatNode::createPrimitive() {
396 if (prim || isOptimized())
399 auto& dstMemPtr = getChildEdgeAt(0)->getMemoryPtr();
400 if (!dstMemPtr || !dstMemPtr->GetPrimitivePtr())
401 THROW_IE_EXCEPTION << "Destination memory didn't allocate.";
402 if (getSelectedPrimitiveDescriptor() == nullptr)
403 THROW_IE_EXCEPTION << "Preferable primitive descriptor does not set.";
405 std::vector<memory::primitive_desc> srcs_pd;
406 std::vector<primitive::at> srcs_p;
// Collect one primitive_desc + primitive per input, validating allocation.
408 for (size_t i = 0; i < getParentEdges().size(); i++) {
409 auto& srcMemPtr = getParentEdgeAt(i)->getMemoryPtr();
410 if (!srcMemPtr || !srcMemPtr->GetPrimitivePtr()) {
411 auto parent = getParentEdgeAt(i)->getParent();
412 THROW_IE_EXCEPTION << "Source memory from " << parent->getName() << " didn't allocate for node "
// Overwrite the descriptor's dims with the actual edge dims before building the
// source primitive_desc. NOTE(review): presumably this accounts for dynamic
// batch (dynBatchSupport is enabled for this node) — confirm.
416 auto desc = srcMemPtr->GetDescriptor();
417 auto dims = getParentEdgeAt(i)->getDims();
418 for (size_t j = 0; j < dims.ndims(); j++) {
419 desc.data.dims[j] = dims[j];
422 srcs_pd.emplace_back(desc, srcMemPtr->GetPrimitiveDescriptor().get_engine());
423 srcs_p.emplace_back(srcMemPtr->GetPrimitive());
// Same dims fix-up for the destination, including its blocking padding dims.
426 auto desc = getChildEdgeAt(0)->getMemory().GetDescriptor();
427 auto dims = getChildEdgeAt(0)->getDims();
428 for (size_t i = 0; i < dims.ndims(); i++) {
429 desc.data.dims[i] = dims[i];
430 desc.data.layout_desc.blocking.padding_dims[i] = dims[i];
// Build the concat primitive along the configured axis.
433 auto primitive_desc = concat::primitive_desc(desc, static_cast<int>(axis), srcs_pd);
435 prim.reset(new concat(primitive_desc, srcs_p, getChildEdgeAt(0)->getMemory().GetPrimitive()));
// Maps a logical dimension index to its position within a permuted `order`
// vector: scans for the element equal to `axis`. The successful return (of i)
// and the not-found fallback are in lines omitted from this view — presumably
// `return i;` inside the match and an error/sentinel value afterwards; confirm
// against the full source.
438 size_t MKLDNNConcatNode::inverseOrder(const SizeVector& order, size_t axis) {
439 for (size_t i = 0; i < order.size(); i++) {
440 if (axis == order[i]) {
// Finalizes the selected config for the in-place (optimized) case: resolves the
// still-uninitialized output tensor descriptors, then rewrites every input
// descriptor so it shares the output's strides and points into the output
// buffer at the correct channel offset. Non-optimized nodes defer to the base
// implementation.
447 void MKLDNNConcatNode::initOptimalPrimitiveDescriptor() {
448 if (!isOptimized()) {
449 MKLDNNNode::initOptimalPrimitiveDescriptor();
// Nothing to do if the config is already fully initialized.
453 auto config = getSelectedPrimitiveDescriptor()->getConfig();
454 if (isInitConfig(config))
// Resolve each output desc that is still marked uninitialized (and not ANY):
// prefer adopting the consumer's matching input desc so the in-place chain
// lines up end to end.
457 for (size_t i = 0; i < config.outConfs.size(); i++) {
458 if (config.outConfs[i].desc.getLayout() == InferenceEngine::Layout::ANY ||
459 !isUninitTensorDesc(config.outConfs[i].desc))
462 int num = getChildEdgeAt(i)->getOutputNum();
464 auto childConf = getChildEdgeAt(i)->getChild()->getSelectedPrimitiveDescriptor()->getConfig().inConfs[num];
465 childConf.desc.setPrecision(config.outConfs[i].desc.getPrecision());
467 if (getChildEdgeAt(i)->getChild()->getSelectedPrimitiveDescriptor()) {
// If the child's desc is itself uninitialized and in-place, let the child
// resolve first, then reuse its desc when compatible.
468 if (isUninitTensorDesc(childConf.desc) && childConf.inPlace >= 0)
469 getChildEdgeAt(i)->getChild()->initOptimalPrimitiveDescriptor();
471 if (!isUninitTensorDesc(childConf.desc) &&
472 MKLDNNExtensionUtils::initTensorsAreEqual(childConf.desc, config.outConfs[i].desc)) {
473 config.outConfs[i].desc = childConf.desc;
// Fallback: default blocking desc built from the existing block dims and order.
478 config.outConfs[i].desc = InferenceEngine::TensorDesc(config.outConfs[i].desc.getPrecision(),
479 config.outConfs[i].desc.getDims(), {
480 config.outConfs[i].desc.getBlockingDesc().getBlockDims(),
481 config.outConfs[i].desc.getBlockingDesc().getOrder()
// Each input reuses the output's strides/offsets, shifted by the running
// `offset`. NOTE(review): `offset` appears here without a visible definition or
// the per-input accumulation (offset += axisSize * ...) — those lines are
// omitted from this view; presumably offset advances by each input's channel
// slice so inputs tile the output along the concat axis. Confirm.
485 for (size_t i = 0; i < config.inConfs.size(); i++) {
486 config.inConfs[i].desc = InferenceEngine::TensorDesc(config.inConfs[i].desc.getPrecision(),
487 config.inConfs[i].desc.getDims(), {
488 config.inConfs[i].desc.getBlockingDesc().getBlockDims(),
489 config.inConfs[i].desc.getBlockingDesc().getOrder(),
490 config.outConfs[0].desc.getBlockingDesc().getOffsetPadding() + offset,
491 config.outConfs[0].desc.getBlockingDesc().getOffsetPaddingToData(),
492 config.outConfs[0].desc.getBlockingDesc().getStrides()
// axisSize = product of block dims from the concat axis onward (the size of one
// input's slice). For NHWC the axis position must first be mapped through the
// permuted order via inverseOrder().
496 if (config.inConfs[0].desc.getLayout() == Layout::NHWC) {
497 // This is more general and works for any "direct" Layout (such as nchw or nhwc), but it doesn't work for nchw8c
498 size_t realAxis = inverseOrder(config.inConfs[0].desc.getBlockingDesc().getOrder(), axis);
499 for (size_t j = realAxis; j < config.inConfs[i].desc.getBlockingDesc().getBlockDims().size(); j++) {
500 size_t jj = config.inConfs[0].desc.getBlockingDesc().getOrder()[j];
501 axisSize *= config.inConfs[i].desc.getBlockingDesc().getBlockDims()[jj];
504 // This works for nchw and nchw8c/nchw16c
505 for (size_t j = axis; j < config.inConfs[i].desc.getBlockingDesc().getBlockDims().size(); j++) {
506 axisSize *= config.inConfs[i].desc.getBlockingDesc().getBlockDims()[j];
// Commit the fully resolved configuration.
511 initDescriptor(config);
// Runs the concat. For s8/u8 data a hand-rolled parallel copy is used (lines
// omitted from this view presumably contain the early return for the optimized
// in-place case and the `if (isInt8)` branch guard — confirm); all other data
// types fall through to MKLDNNNode::execute(), which runs the mkldnn primitive
// built in createPrimitive().
514 void MKLDNNConcatNode::execute(mkldnn::stream strm) {
519 const MKLDNNMemory& dst_memory = getChildEdgeAt(0)->getMemory();
520 const mkldnn::memory::data_type data_type = dst_memory.GetDataType();
// Int8 path handles both signed and unsigned 8-bit data.
522 const bool isInt8 = (data_type == mkldnn_s8 || data_type == mkldnn_u8);
525 uint8_t* dst_ptr = reinterpret_cast<uint8_t*>(dst_memory.GetData());
527 const size_t num_src = getParentEdges().size();
// Per-input channel counts (dim 1), source pointers, and the destination
// pointer for each input's channel slice within one output "pixel".
529 std::vector<size_t> channels;
530 size_t channels_size = 0;
531 std::vector<const uint8_t*> src_ptrs;
532 std::vector<uint8_t*> dst_ptrs;
534 for (size_t i = 0; i < num_src; i++) {
535 const MKLDNNMemory& src_mem = getParentEdgeAt(i)->getMemory();
536 const size_t num_channels = src_mem.GetDims()[1];
538 channels.push_back(num_channels);
539 src_ptrs.push_back(reinterpret_cast<const uint8_t*>(src_mem.GetData()));
540 dst_ptrs.push_back(dst_ptr + channels_size);
541 channels_size += num_channels;
// Copy channels[j] bytes per iteration: this relies on channels being the
// innermost contiguous dimension (the NHWC assumption made for the I8 config
// in initSupportedPrimitiveDescriptors) and on 1-byte elements (s8/u8).
// iter_count = total elements per source / channels per source = N*H*W.
544 const size_t iter_count = getParentEdgeAt(0)->getMemory().GetSize() / channels[0];
546 parallel_for(iter_count, [&](int i) {
547 const size_t dst_off = i * channels_size;
548 for (int j = 0; j < num_src; j++) {
549 memcpy(dst_ptrs[j] + dst_off, src_ptrs[j] + i * channels[j], channels[j]);
// Non-int8 path: execute the mkldnn concat primitive via the base class.
553 MKLDNNNode::execute(strm);