// File: inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp
// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "mkldnn_concat_node.h"

#include <limits>
#include <map>
#include <utility>
#include <vector>

#include <mkldnn_extension_utils.h>

#include "details/ie_exception.hpp"
#include "ie_layers.h"
#include "mkldnn.hpp"
#include "mkldnn/iml_type_mapper.h"
#include "mkldnn_dims.h"
#include "mkldnn_edge.h"
#include "mkldnn_memory.h"
#include "ie_parallel.hpp"

using namespace mkldnn;
using namespace MKLDNNPlugin;
using namespace InferenceEngine;

MKLDNNConcatNode::MKLDNNConcatNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng) : MKLDNNNode(layer, eng) {}

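// Reads the concat axis from the layer and validates the inputs: every parent
// edge must have the same rank and identical dimensions on all axes except the
// concatenation axis.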
void MKLDNNConcatNode::getSupportedDescriptors() {
    auto * conLayer = dynamic_cast<ConcatLayer*>(getCnnLayer().get());

    if (conLayer == nullptr)
        THROW_IE_EXCEPTION << "Cannot convert concat layer.";

    axis = conLayer->_axis;

    if (getParentEdges().empty())
        THROW_IE_EXCEPTION << "Incorrect number of input edges for layer " << getName();
    if (getChildEdges().empty())
        THROW_IE_EXCEPTION << "Incorrect number of output edges for layer " << getName();
    auto& firstParentDims = getParentEdgeAt(0)->getDims();
    for (size_t i = 1; i < getParentEdges().size(); i++) {
        auto& dims = getParentEdgeAt(i)->getDims();
        bool incorrectDims = false;
        for (size_t j = 0; j < firstParentDims.ndims(); j++) {
            if (j == axis)
                continue;
            if (dims.ndims() != firstParentDims.ndims() || firstParentDims[j] != dims[j]) {
                incorrectDims = true;
                break;
            }
        }
        if (incorrectDims || firstParentDims.ndims() == 0) {
            THROW_IE_EXCEPTION << "Incorrect input dimensions for concat node " << getName();
        }
    }
}

void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
    if (!supportedPrimitiveDescriptors.empty())
        return;

    InferenceEngine::Precision iIEPrecision = getCnnLayer()->insData[0].lock()->getPrecision();
    auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(iIEPrecision);
    InferenceEngine::Precision precision = getCnnLayer()->outData[0]->getPrecision();
    auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);

    MKLDNNDims dstDims = getChildEdgeAt(0)->getDims();
    InferenceEngine::LayerConfig config;
    config.dynBatchSupport = true;
    bool hasEltwise = false;

    for (size_t i = 0; i < getParentEdges().size(); i++) {
        auto parentEdge = getParentEdgeAt(i);
        if (parentEdge->getParent()->getType() == Eltwise)
            hasEltwise = true;

        InferenceEngine::DataConfig dataConfig;
        dataConfig.inPlace = -1;
        dataConfig.constant = false;
        dataConfig.desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(parentEdge->getDims(), inputDataType, memory::format::any));
        config.inConfs.push_back(dataConfig);
    }

    auto dims = getChildEdgeAt(0)->getDims();

    config.outConfs.resize(1);
    config.outConfs[0].inPlace = -1;
    config.outConfs[0].constant = false;
    config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(dims, outputDataType, MKLDNNMemory::GetPlainFormat(dims)));
    supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref);
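    // If the channel count is a multiple of 8 (or 16), additionally offer the
    // channel-blocked layouts (nChw8c/nChw16c for 4D, nCdhw8c/nCdhw16c for 5D)
    // as reference implementations.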
    if (dims.ndims() == 4) {
        if (dims[1] % 8 == 0) {
            config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::nChw8c));
            supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref);

            if (dims[1] % 16 == 0) {
                config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::nChw16c));
                supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref);
            }
        }
    } else if (dims.ndims() == 5) {
        if (dims[1] % 8 == 0) {
            config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::nCdhw8c));
            supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref);

            if (dims[1] % 16 == 0) {
                config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::nCdhw16c));
                supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref);
            }
        }
    }

    if (axis != 1 || hasEltwise)
        return;

    auto numOfDim = static_cast<size_t>(dstDims.ndims());

    SizeVector order(numOfDim);
    SizeVector offsets(numOfDim, 0lu);
    size_t offset = std::numeric_limits<size_t>::max();
    for (size_t i = 0; i < numOfDim; i++) {
        order[i] = i;
    }

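    // Below, descriptors for the optimized (potentially in-place) concat over
    // the channel axis are built. offset is set to max(), which this plugin
    // appears to treat as an "uninitialized" marker, so that the real offsets
    // can be filled in later by initOptimalPrimitiveDescriptor.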
    if (this->getCnnLayer()->precision == Precision::I8) {
        if (numOfDim == 4) {
            // Here we assume NHWC layout (channels are the last)

            order = {0, 2, 3, 1};
            offsets = {0, 0, 0, 0};

            SizeVector blkDims = dstDims.ToSizeVector();
            blkDims = { blkDims[0], blkDims[2], blkDims[3], blkDims[1] };

            SizeVector strides(numOfDim);
            // C is the last in NHWC, so all strides are max()
            for (size_t i = 0; i < numOfDim; i++) {
                strides[i] = std::numeric_limits<size_t>::max();
            }

            config.outConfs[0].desc = TensorDesc(this->getCnnLayer()->outData[0]->getPrecision(),
                                                 dstDims.ToSizeVector(),
                                                 { blkDims, order, offset, offsets, strides });
            for (size_t i = 0; i < getParentEdges().size(); i++) {
                auto parentEdge = getParentEdgeAt(i);

                SizeVector blkDims = parentEdge->getDims().ToSizeVector();
                blkDims = { blkDims[0], blkDims[2], blkDims[3], blkDims[1] };

                config.inConfs[i].inPlace = -1;     // Change to 0 here if inplace concat is supported for NHWC in mkldnn

                config.inConfs[i].desc = TensorDesc(iIEPrecision, parentEdge->getDims().ToSizeVector(),
                                                    {blkDims, order, offset, offsets, strides});
            }

            supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref);
        }
    } else {
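        // Plain (non-blocked) in-place descriptor: strides for dimensions outer
        // to the concat axis are left as max() (to be inherited from the shared
        // output buffer), while the inner strides are computed from the
        // destination dims.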
        SizeVector strides(numOfDim);
        strides[numOfDim - 1] = 1;
        for (size_t i = 2; i <= numOfDim; i++) {
            if (numOfDim - i < axis) {
                strides[numOfDim - i] = std::numeric_limits<size_t>::max();
            } else {
                strides[numOfDim - i] = strides[numOfDim - i + 1] * dstDims[numOfDim - i + 1];
            }
        }

        config.outConfs[0].desc = TensorDesc(
                MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType),
                dstDims.ToSizeVector(),
                {dstDims.ToSizeVector(), order, offset, offsets, strides});
        for (size_t i = 0; i < getParentEdges().size(); i++) {
            auto parentEdge = getParentEdgeAt(i);
            config.inConfs[i].inPlace = 0;
            config.inConfs[i].desc = TensorDesc(MKLDNNExtensionUtils::DataTypeToIEPrecision(inputDataType), parentEdge->getDims().ToSizeVector(),
                                                {parentEdge->getDims().ToSizeVector(), order, offset, offsets, strides});
        }

        supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown);

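        // Repeat the in-place setup for the channel-blocked layouts (nChw8c /
        // nChw16c, nCdhw8c / nCdhw16c). In-place is only valid when the channel
        // count of every input is itself a multiple of the block size;
        // otherwise the blocked inputs would not tile the output exactly.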
        if (numOfDim == 4lu || numOfDim == 5lu) {
            size_t blkDimsLen = numOfDim + 1;
            order.resize(blkDimsLen);
            for (size_t i = 0; i < numOfDim; i++) {
                order[i] = i;
            }
            order[numOfDim] = 1lu;
            offsets = SizeVector(blkDimsLen, 0lu);

            // nChw8c, nChw16c, nCdhw8c, nCdhw16c
            for (size_t sizeS : {8lu, 16lu}) {
                SizeVector blkDims = dstDims.ToSizeVector();
                if (blkDims[1] % sizeS)
                    continue;
                blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1lu : 0lu);
                blkDims.push_back(sizeS);

                strides.resize(blkDimsLen);
                strides[blkDimsLen - 1] = 1;
                for (size_t i = 2lu; i <= blkDimsLen; i++) {
                    if (blkDimsLen - i < axis) {
                        strides[blkDimsLen - i] = std::numeric_limits<size_t>::max();
                    } else {
                        strides[blkDimsLen - i] = strides[blkDimsLen - i + 1] * blkDims[blkDimsLen - i + 1];
                    }
                }
                config.outConfs[0].desc = TensorDesc(
                        MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType),
                        dstDims.ToSizeVector(), {blkDims, order, offset, offsets, strides});

                bool canInplace = true;
                for (size_t i = 0lu; canInplace && i < getParentEdges().size(); i++) {
                    auto parentEdge = getParentEdgeAt(i);
                    blkDims = parentEdge->getDims().ToSizeVector();
                    if (blkDims[1] % sizeS)
                        canInplace = false;

                    blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1lu : 0lu);
                    blkDims.push_back(sizeS);
                    config.inConfs[i].desc = TensorDesc(MKLDNNExtensionUtils::DataTypeToIEPrecision(inputDataType), parentEdge->getDims().ToSizeVector(),
                                                        {blkDims, order, offset, offsets, strides});
                }
                if (canInplace)
                    supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown);
            }
        }
    }
}

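// Chooses a primitive descriptor: prefer an in-place (zero-copy) descriptor
// when the topology allows it, otherwise fall back to a copy implementation
// whose memory format matches the neighbouring nodes best.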
void MKLDNNConcatNode::selectOptimalPrimitiveDescriptor() {
    InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision();
    auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
    precision = getCnnLayer()->outData[0]->getPrecision();
    auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);

    bool hasUnknown = false;
    std::vector<size_t> canSelectPrimitive;
    for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); i++) {
        bool hasAny = true;
        auto &primDescInfo = supportedPrimitiveDescriptors[i];
        if (primDescInfo.getImplementationType() != impl_desc_type::unknown ||
                primDescInfo.getConfig().inConfs[0].inPlace < 0)
            continue;
        hasUnknown = true;
        for (auto iInfo : primDescInfo.getConfig().inConfs) {
            if (iInfo.desc.getLayout() != InferenceEngine::Layout::ANY) {
                hasAny = false;
                break;
            }
        }

        if (hasAny) {
            for (auto oInfo : primDescInfo.getConfig().outConfs) {
                if (oInfo.desc.getLayout() != InferenceEngine::Layout::ANY) {
                    hasAny = false;
                    break;
                }
            }
        }

        if (!hasAny) {
            canSelectPrimitive.push_back(i);
        }
    }

    bool hasDoubleConnection = false;
    for (size_t i = 0; i < getParentEdges().size(); i++) {
        for (size_t j = i + 1; j < getParentEdges().size(); j++) {
            if (getParentEdgeAt(i) == getParentEdgeAt(j)) hasDoubleConnection = true;
        }
    }

    if (hasDoubleConnection) {
        // A double connection means some tensor has to be replicated. The
        // in-place approach is not applicable in that case; the descriptor
        // with index 0 is the pure copy implementation.
        selectPrimitiveDescriptorByIndex(0);
        return;
    }

    bool canOptimize = true;
    for (size_t i = 0; canOptimize && i < getParentEdges().size(); i++) {
        const auto& parent = getParentEdgeAt(i)->getParent();
        for (size_t j = 0; canOptimize && j < parent->getChildEdges().size(); j++) {
            const auto& child = parent->getChildEdgeAt(j)->getChild();
            const auto* childConcat = dynamic_cast<MKLDNNConcatNode *>(child.get());
            if (!childConcat || childConcat == this)
                continue;
            if (childConcat->isOptimized())
                canOptimize = false;
        }
    }
    if (hasUnknown && axis == 1) {
        if (canSelectPrimitive.size() == 1) {
            selectPrimitiveDescriptorByIndex(static_cast<int>(canSelectPrimitive[0]));
            return;
        }
    } else {
        canOptimize = false;
    }

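    // Vote on the target memory format: count the formats already selected by
    // parents and children so that the concat introduces as few reorders as
    // possible.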
    std::map<mkldnn::memory::format, size_t> formatFrequency;
    for (size_t i = 0; i < getParentEdges().size(); i++) {
        auto parentEdge = getParentEdgeAt(i);
        auto parent = parentEdge->getParent();

        if (parent->getSelectedPrimitiveDescriptor() == nullptr)
            continue;

        int outputIndex = parentEdge->getOutputNum();
        if (outputIndex < 0)
            THROW_IE_EXCEPTION << "Cannot find index of output node";
        if (outputIndex >= parent->getSelectedPrimitiveDescriptor()->getConfig().outConfs.size())
            outputIndex = 0;
        auto outDesc = MKLDNNMemoryDesc(parent->getSelectedPrimitiveDescriptor()->getConfig().outConfs[outputIndex].desc);
        if (!outDesc)
            continue;
        if (formatFrequency.find(outDesc.getFormat()) != formatFrequency.end())
            formatFrequency[outDesc.getFormat()] += 1;
        else
            formatFrequency[outDesc.getFormat()] = 1;
    }
    for (size_t i = 0; i < getChildEdges().size(); i++) {
        auto childEdge = getChildEdgeAt(i);
        auto child = childEdge->getChild();
        if (child->getSelectedPrimitiveDescriptor() == nullptr)
            continue;
        int inputIndex = childEdge->getOutputNum();
        if (inputIndex < 0)
            THROW_IE_EXCEPTION << "Cannot find index of input node";
        if (inputIndex >= child->getSelectedPrimitiveDescriptor()->getConfig().inConfs.size())
            inputIndex = 0;
        auto inDesc = MKLDNNMemoryDesc(child->getSelectedPrimitiveDescriptor()->getConfig().inConfs[inputIndex].desc);
        if (!inDesc)
            continue;
        if (formatFrequency.find(inDesc.getFormat()) != formatFrequency.end())
            formatFrequency[inDesc.getFormat()] += 1;
        else
            formatFrequency[inDesc.getFormat()] = 1;
    }

    size_t maxCount = 0;
    mkldnn::memory::format convertTo = MKLDNNMemory::GetPlainFormat(getChildEdgeAt(0)->getDims());
    for (auto &it : formatFrequency) {
        if (it.second > maxCount) {
            maxCount = it.second;
            convertTo = it.first;
        }
    }

    if (canOptimize && MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, convertTo).blocksExtended())
        convertTo = MKLDNNMemory::GetPlainFormat(getChildEdgeAt(0)->getDims());
    for (size_t i = 0; canOptimize && i < getParentEdges().size(); i++) {
        if (MKLDNNMemoryDesc(getParentEdgeAt(i)->getDims(), inputDataType, convertTo).blocksExtended())
            convertTo = MKLDNNMemory::GetPlainFormat(getChildEdgeAt(0)->getDims());
    }

    for (auto supportedPdIndex : canSelectPrimitive) {
        if (MKLDNNMemoryDesc(supportedPrimitiveDescriptors[supportedPdIndex].getConfig().inConfs[0].desc).getFormat() == convertTo) {
            selectPrimitiveDescriptorByIndex(static_cast<int>(supportedPdIndex));
            return;
        }
    }

    for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); i++) {
        auto &primDescInfo = supportedPrimitiveDescriptors[i];
        if (primDescInfo.getImplementationType() == impl_desc_type::unknown)
            continue;
        if (convertTo == MKLDNNMemoryDesc(supportedPrimitiveDescriptors[i].getConfig().outConfs[0].desc).getFormat()) {
            size_t num = 0;
            for (num = 0; num < getParentEdges().size(); num++) {
                if (MKLDNNMemoryDesc(getParentEdgeAt(num)->getDims(), inputDataType, convertTo).blocksExtended())
                    break;
            }
            if (num == getParentEdges().size()) {
                selectPrimitiveDescriptorByIndex(static_cast<int>(i));
                return;
            }
        }
    }
    selectPrimitiveDescriptorByIndex(0);
}

bool MKLDNNConcatNode::created() const {
    return getType() == Concatenation;
}

bool MKLDNNConcatNode::isOptimized() const {
    return getSelectedPrimitiveDescriptor() && getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].inPlace >= 0;
}

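// Builds the mkldnn concat primitive. For the optimized case nothing is
// created: an in-place concat is expressed purely through memory offsets and
// needs no kernel to run.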
void MKLDNNConcatNode::createPrimitive() {
    if (prim || isOptimized())
        return;

    auto& dstMemPtr = getChildEdgeAt(0)->getMemoryPtr();
    if (!dstMemPtr || !dstMemPtr->GetPrimitivePtr())
        THROW_IE_EXCEPTION << "Destination memory was not allocated.";
    if (getSelectedPrimitiveDescriptor() == nullptr)
        THROW_IE_EXCEPTION << "Preferable primitive descriptor is not set.";

    std::vector<memory::primitive_desc> srcs_pd;
    std::vector<primitive::at> srcs_p;

    for (size_t i = 0; i < getParentEdges().size(); i++) {
        auto& srcMemPtr = getParentEdgeAt(i)->getMemoryPtr();
        if (!srcMemPtr || !srcMemPtr->GetPrimitivePtr()) {
            auto parent = getParentEdgeAt(i)->getParent();
            THROW_IE_EXCEPTION << "Source memory from " << parent->getName() << " was not allocated for node "
                               << getName() << ".";
        }

        auto desc = srcMemPtr->GetDescriptor();
        auto dims = getParentEdgeAt(i)->getDims();
        for (size_t j = 0; j < dims.ndims(); j++) {
            desc.data.dims[j] = dims[j];
        }

        srcs_pd.emplace_back(desc, srcMemPtr->GetPrimitiveDescriptor().get_engine());
        srcs_p.emplace_back(srcMemPtr->GetPrimitive());
    }

    auto desc = getChildEdgeAt(0)->getMemory().GetDescriptor();
    auto dims = getChildEdgeAt(0)->getDims();
    for (size_t i = 0; i < dims.ndims(); i++) {
        desc.data.dims[i] = dims[i];
        desc.data.layout_desc.blocking.padding_dims[i] = dims[i];
    }

    auto primitive_desc = concat::primitive_desc(desc, static_cast<int>(axis), srcs_pd);

    prim.reset(new concat(primitive_desc, srcs_p, getChildEdgeAt(0)->getMemory().GetPrimitive()));
}

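// Returns the position of a logical axis inside a permuted dimension order
// (the index i for which order[i] == axis). Note the size_t return type: the
// -1 returned when the axis is absent wraps around to SIZE_MAX.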
size_t MKLDNNConcatNode::inverseOrder(const SizeVector& order, size_t axis) {
    for (size_t i = 0; i < order.size(); i++) {
        if (axis == order[i]) {
            return i;
        }
    }
    return -1;
}

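// For an optimized (in-place) concat, rewrites the input descriptors so that
// each one becomes a view into the shared output buffer: inputs inherit the
// output's strides and receive a running offset along the concat axis.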
void MKLDNNConcatNode::initOptimalPrimitiveDescriptor() {
    if (!isOptimized()) {
        MKLDNNNode::initOptimalPrimitiveDescriptor();
        return;
    }

    auto config = getSelectedPrimitiveDescriptor()->getConfig();
    if (isInitConfig(config))
        return;

    for (size_t i = 0; i < config.outConfs.size(); i++) {
        if (config.outConfs[i].desc.getLayout() == InferenceEngine::Layout::ANY ||
                !isUninitTensorDesc(config.outConfs[i].desc))
            continue;

        // Check that the child's selected descriptor exists before reading its
        // config, to avoid dereferencing a null primitive descriptor.
        int num = getChildEdgeAt(i)->getOutputNum();
        if (num >= 0 && getChildEdgeAt(i)->getChild()->getSelectedPrimitiveDescriptor()) {
            auto childConf = getChildEdgeAt(i)->getChild()->getSelectedPrimitiveDescriptor()->getConfig().inConfs[num];
            childConf.desc.setPrecision(config.outConfs[i].desc.getPrecision());

            if (isUninitTensorDesc(childConf.desc) && childConf.inPlace >= 0)
                getChildEdgeAt(i)->getChild()->initOptimalPrimitiveDescriptor();

            if (!isUninitTensorDesc(childConf.desc) &&
                    MKLDNNExtensionUtils::initTensorsAreEqual(childConf.desc, config.outConfs[i].desc)) {
                config.outConfs[i].desc = childConf.desc;
                continue;
            }
        }
        config.outConfs[i].desc = InferenceEngine::TensorDesc(config.outConfs[i].desc.getPrecision(),
                                                              config.outConfs[i].desc.getDims(), {
                                                                      config.outConfs[i].desc.getBlockingDesc().getBlockDims(),
                                                                      config.outConfs[i].desc.getBlockingDesc().getOrder()
                                                              });
    }
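    // Each input becomes a view into the output: it reuses the output's strides
    // and is shifted by a running offset that grows by the input's element
    // count over the concat axis and all inner dimensions.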
    size_t offset = 0;
    for (size_t i = 0; i < config.inConfs.size(); i++) {
        config.inConfs[i].desc = InferenceEngine::TensorDesc(config.inConfs[i].desc.getPrecision(),
                                                             config.inConfs[i].desc.getDims(), {
                                                                  config.inConfs[i].desc.getBlockingDesc().getBlockDims(),
                                                                  config.inConfs[i].desc.getBlockingDesc().getOrder(),
                                                                  config.outConfs[0].desc.getBlockingDesc().getOffsetPadding() + offset,
                                                                  config.outConfs[0].desc.getBlockingDesc().getOffsetPaddingToData(),
                                                                  config.outConfs[0].desc.getBlockingDesc().getStrides()
                                                             });
        size_t axisSize = 1;

        if (config.inConfs[0].desc.getLayout() == Layout::NHWC) {
            // This is more general and works for any "direct" layout (such as nchw or nhwc),
            // but it does not work for blocked layouts such as nchw8c
            size_t realAxis = inverseOrder(config.inConfs[0].desc.getBlockingDesc().getOrder(), axis);
            for (size_t j = realAxis; j < config.inConfs[i].desc.getBlockingDesc().getBlockDims().size(); j++) {
                size_t jj = config.inConfs[0].desc.getBlockingDesc().getOrder()[j];
                axisSize *= config.inConfs[i].desc.getBlockingDesc().getBlockDims()[jj];
            }
        } else {
            // This works for nchw and nchw8c/nchw16c
            for (size_t j = axis; j < config.inConfs[i].desc.getBlockingDesc().getBlockDims().size(); j++) {
                axisSize *= config.inConfs[i].desc.getBlockingDesc().getBlockDims()[j];
            }
        }
        offset += axisSize;
    }
    initDescriptor(config);
}

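// Executes the concatenation. The optimized (in-place) variant is a no-op. For
// int8 data the copy is done manually with a parallel per-row memcpy; all
// other precisions run the mkldnn concat primitive via the base class.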
void MKLDNNConcatNode::execute(mkldnn::stream strm) {
    if (isOptimized()) {
        return;
    }

    const MKLDNNMemory& dst_memory = getChildEdgeAt(0)->getMemory();
    const mkldnn::memory::data_type data_type = dst_memory.GetDataType();

    const bool isInt8 = (data_type == mkldnn_s8 || data_type == mkldnn_u8);

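    // The manual int8 path relies on the NHWC descriptors created above: each
    // of the iter_count rows holds the channels of one (n, h, w) position
    // contiguously, so the inputs can be interleaved with plain memcpy.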
    if (isInt8) {
        uint8_t* dst_ptr = reinterpret_cast<uint8_t*>(dst_memory.GetData());

        const size_t num_src = getParentEdges().size();

        std::vector<size_t> channels;
        size_t channels_size = 0;
        std::vector<const uint8_t*> src_ptrs;
        std::vector<uint8_t*> dst_ptrs;

        for (size_t i = 0; i < num_src; i++) {
            const MKLDNNMemory& src_mem = getParentEdgeAt(i)->getMemory();
            const size_t num_channels = src_mem.GetDims()[1];

            channels.push_back(num_channels);
            src_ptrs.push_back(reinterpret_cast<const uint8_t*>(src_mem.GetData()));
            dst_ptrs.push_back(dst_ptr + channels_size);
            channels_size += num_channels;
        }

        const size_t iter_count = getParentEdgeAt(0)->getMemory().GetSize() / channels[0];

        parallel_for(iter_count, [&](int i) {
            const size_t dst_off = i * channels_size;
            for (size_t j = 0; j < num_src; j++) {
                memcpy(dst_ptrs[j] + dst_off, src_ptrs[j] + i * channels[j], channels[j]);
            }
        });
    } else {
        MKLDNNNode::execute(strm);
    }
}