inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp

   1 // Copyright (C) 2018-2019 Intel Corporation
   2 // SPDX-License-Identifier: Apache-2.0
   3 //
   4
   5 #include "mkldnn_concat_node.h"
   6
   7 #include <map>
   8 #include <utility>
   9 #include <vector>
  10 #include <mkldnn_extension_utils.h>
  11
  12 #include "details/ie_exception.hpp"
  13 #include "ie_layers.h"
  14 #include "mkldnn.hpp"
  15 #include "mkldnn/iml_type_mapper.h"
  16 #include "mkldnn_dims.h"
  17 #include "mkldnn_edge.h"
  18 #include "mkldnn_memory.h"
  19 #include "ie_parallel.hpp"
  20 #include <limits>
  21
  22 using namespace mkldnn;
  23 using namespace MKLDNNPlugin;
  24 using namespace InferenceEngine;
  25
  26 MKLDNNConcatNode::MKLDNNConcatNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng) : MKLDNNNode(layer, eng) {}
  27
  28 void MKLDNNConcatNode::getSupportedDescriptors() {
  29     auto * conLayer = dynamic_cast<ConcatLayer*>(getCnnLayer().get());
  30
  31     if (conLayer == nullptr)
  32         THROW_IE_EXCEPTION << "Cannot convert concat layer.";
  33
  34     axis = conLayer->_axis;
  35
  36     if (getParentEdges().empty())
  37         THROW_IE_EXCEPTION << "Incorrect number of input edges for layer " << getName();
  38     if (getChildEdges().empty())
  39         THROW_IE_EXCEPTION << "Incorrect number of output edges for layer " << getName();
  40     auto& firstParentDims = getParentEdgeAt(0)->getDims();
  41     for (size_t i = 1; i < getParentEdges().size(); i++) {
  42         auto& dims = getParentEdgeAt(i)->getDims();
  43         bool incorrectDims = false;
  44         for (size_t j = 0; j < firstParentDims.ndims(); j++) {
  45             if (j == axis)
  46                 continue;
  47             if (dims.ndims() != firstParentDims.ndims() || firstParentDims[j] != dims[j]) {
  48                 incorrectDims = true;
  49                 break;
  50             }
  51         }
  52         if (incorrectDims || firstParentDims.ndims() == 0) {
  53             THROW_IE_EXCEPTION << "Incorrect input dimensions for concat node " << getName();
  54         }
  55     }
  56 }
  57
  58 void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
  59     if (!supportedPrimitiveDescriptors.empty())
  60         return;
  61
  62     InferenceEngine::Precision iIEPrecision = getCnnLayer()->insData[0].lock()->getPrecision();
  63     auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(iIEPrecision);
  64     InferenceEngine::Precision precision = getCnnLayer()->outData[0]->getPrecision();
  65     auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
  66
  67     MKLDNNDims dstDims = getChildEdgeAt(0)->getDims();
  68     InferenceEngine::LayerConfig config;
  69     config.dynBatchSupport = true;
  70     bool hasEltwise = false;
  71
  72     for (size_t i = 0; i < getParentEdges().size(); i++) {
  73         auto parentEdge = getParentEdgeAt(i);
  74         if (parentEdge->getParent()->getType() == Eltwise)
  75             hasEltwise = true;
  76
  77         InferenceEngine::DataConfig dataConfig;
  78         dataConfig.inPlace = -1;
  79         dataConfig.constant = false;
  80         dataConfig.desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(parentEdge->getDims(), inputDataType, memory::format::any));
  81         config.inConfs.push_back(dataConfig);
  82     }
  83
  84     auto dims = getChildEdgeAt(0)->getDims();
  85
  86     config.outConfs.resize(1);
  87     config.outConfs[0].inPlace = -1;
  88     config.outConfs[0].constant = false;
  89     config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(dims, outputDataType, MKLDNNMemory::GetPlainFormat(dims)));
  90     supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref);
  91     if (dims.ndims() == 4) {
  92         if (dims[1] % 8 == 0) {
  93             config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::nChw8c));
  94             supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref);
  95
  96             if (dims[1] % 16 == 0) {
  97                 config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::nChw16c));
  98                 supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref);
  99             }
 100         }
 101     } else if (dims.ndims() == 5) {
 102         if (dims[1] % 8 == 0) {
 103             config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::nCdhw8c));
 104             supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref);
 105
 106             if (dims[1] % 16 == 0) {
 107                 config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::nCdhw16c));
 108                 supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref);
 109             }
 110         }
 111     }
 112
 113     if (axis != 1 || hasEltwise)
 114         return;
 115
 116     auto numOfDim = static_cast<size_t>(dstDims.ndims());
 117
 118     SizeVector order(numOfDim);
 119     SizeVector offsets(numOfDim, 0lu);
 120     size_t offset = std::numeric_limits<size_t>::max();
 121     for (size_t i = 0; i < numOfDim; i++) {
 122         order[i] = i;
 123     }
 124
 125     if (this->getCnnLayer()->precision == Precision::I8) {
 126         if (numOfDim == 4) {
 127             // Here we assume NHWC layout (channels are the last)
 128
 129             order = {0, 2, 3, 1};
 130             offsets = {0, 0, 0, 0};
 131
 132             SizeVector blkDims = dstDims.ToSizeVector();
 133             blkDims = { blkDims[0], blkDims[2], blkDims[3], blkDims[1] };
 134
 135             SizeVector strides(numOfDim);
 136             strides.resize(numOfDim);
 137             // C is the last in NHWC, so all strides are max()
 138             for (size_t i = 0; i < numOfDim; i++) {
 139                 strides[i] = std::numeric_limits<size_t>::max();
 140             }
 141
 142             config.outConfs[0].desc = TensorDesc(this->getCnnLayer()->outData[0]->getPrecision(),
 143                                                  dstDims.ToSizeVector(),
 144                                                  { blkDims, order, offset, offsets, strides });
 145             for (size_t i = 0; i < getParentEdges().size(); i++) {
 146                 auto parentEdge = getParentEdgeAt(i);
 147
 148                 SizeVector blkDims = parentEdge->getDims().ToSizeVector();
 149                 blkDims = { blkDims[0], blkDims[2], blkDims[3], blkDims[1] };
 150
 151                 config.inConfs[i].inPlace = -1;     // Change to 0 here if inplace concat is supported for NHWC in mkldnn
 152
 153                 config.inConfs[i].desc = TensorDesc(iIEPrecision, parentEdge->getDims().ToSizeVector(),
 154                                                     {blkDims, order, offset, offsets, strides});
 155             }
 156
 157             supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref);
 158         }
 159     } else {
 160         SizeVector strides(numOfDim);
 161         strides[numOfDim - 1] = 1;
 162         for (size_t i = 2; i <= numOfDim; i++) {
 163             if (numOfDim - i < axis) {
 164                 strides[numOfDim - i] = std::numeric_limits<size_t>::max();
 165             } else {
 166                 strides[numOfDim - i] = strides[numOfDim - i + 1] * dstDims[numOfDim - i + 1];
 167             }
 168         }
 169
 170         config.outConfs[0].desc = TensorDesc(
 171                 MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType),
 172                 dstDims.ToSizeVector(),
 173                 {dstDims.ToSizeVector(), order, offset, offsets, strides});
 174         for (size_t i = 0; i < getParentEdges().size(); i++) {
 175             auto parentEdge = getParentEdgeAt(i);
 176             config.inConfs[i].inPlace = 0;
 177             config.inConfs[i].desc = TensorDesc(MKLDNNExtensionUtils::DataTypeToIEPrecision(inputDataType), parentEdge->getDims().ToSizeVector(),
 178                                                 {parentEdge->getDims().ToSizeVector(), order, offset, offsets, strides});
 179         }
 180
 181         supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown);
 182
 183         if (numOfDim == 4lu || numOfDim == 5lu) {
 184             size_t blkDimsLen = numOfDim + 1;
 185             order.resize(blkDimsLen);
 186             for (size_t i = 0; i < numOfDim; i++) {
 187                 order[i] = i;
 188             }
 189             order[numOfDim] = 1lu;
 190             offsets = SizeVector(blkDimsLen, 0lu);
 191
 192             // nChw8c, nChw16c, nCdhw8c, nCdhw16c
 193             for (size_t sizeS : {8lu, 16lu}) {
 194                 SizeVector blkDims = dstDims.ToSizeVector();
 195                 if (blkDims[1] % sizeS)
 196                     continue;
 197                 blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1lu : 0lu);
 198                 blkDims.push_back(sizeS);
 199
 200                 strides.resize(blkDimsLen);
 201                 strides[blkDimsLen - 1] = 1;
 202                 for (size_t i = 2lu; i <= blkDimsLen; i++) {
 203                     if (blkDimsLen - i < axis) {
 204                         strides[blkDimsLen - i] = std::numeric_limits<size_t>::max();
 205                     } else {
 206                         strides[blkDimsLen - i] = strides[blkDimsLen - i + 1] * blkDims[blkDimsLen - i + 1];
 207                     }
 208                 }
 209                 config.outConfs[0].desc = TensorDesc(
 210                         MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType),
 211                         dstDims.ToSizeVector(), {blkDims, order, offset, offsets, strides});
 212
 213                 bool canInplace = true;
 214                 for (size_t i = 0lu; canInplace && i < getParentEdges().size(); i++) {
 215                     auto parentEdge = getParentEdgeAt(i);
 216                     blkDims = parentEdge->getDims().ToSizeVector();
 217                     if (blkDims[1] % sizeS)
 218                         canInplace = false;
 219
 220                     blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1lu : 0lu);
 221                     blkDims.push_back(sizeS);
 222                     config.inConfs[i].desc =  TensorDesc(MKLDNNExtensionUtils::DataTypeToIEPrecision(inputDataType), parentEdge->getDims().ToSizeVector(),
 223                                                          {blkDims, order, offset, offsets, strides});
 224                 }
 225                 if (canInplace)
 226                     supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown);
 227             }
 228         }
 229     }
 230 }
 231
 232 void MKLDNNConcatNode::selectOptimalPrimitiveDescriptor() {
 233     InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision();
 234     auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
 235     precision = getCnnLayer()->outData[0]->getPrecision();
 236     auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
 237
 238     bool hasUnknown = false;
 239     std::vector<size_t> canSelectPrimitive;
 240     for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); i++) {
 241         bool hasAny = true;
 242         auto &primDescInfo = supportedPrimitiveDescriptors[i];
 243         if (primDescInfo.getImplementationType() != impl_desc_type::unknown ||
 244                 primDescInfo.getConfig().inConfs[0].inPlace < 0)
 245             continue;
 246         hasUnknown = true;
 247         for (auto iInfo : primDescInfo.getConfig().inConfs) {
 248             if (iInfo.desc.getLayout() != InferenceEngine::Layout::ANY) {
 249                 hasAny = false;
 250                 break;
 251             }
 252         }
 253
 254         if (hasAny) {
 255             for (auto oInfo : primDescInfo.getConfig().outConfs) {
 256                 if (oInfo.desc.getLayout() != InferenceEngine::Layout::ANY) {
 257                     hasAny = false;
 258                     break;
 259                 }
 260             }
 261         }
 262
 263         if (!hasAny) {
 264             canSelectPrimitive.push_back(i);
 265         }
 266     }
 267
 268     bool hasDoubleConnection = false;
 269     for (int i = 0; i < getParentEdges().size(); i++) {
 270         for (int j = i + 1; j < getParentEdges().size(); j++) {
 271             if (getParentEdgeAt(i) == getParentEdgeAt(j)) hasDoubleConnection = true;
 272         }
 273     }
 274
 275     if (hasDoubleConnection) {
 276         // The double connection marks that some tensor should
 277         // be replicated. Inplace approach is not applicable
 278         // for that case. Descriptor with index 0 is pure copy
 279         // implementation
 280         selectPrimitiveDescriptorByIndex(0);
 281         return;
 282     }
 283
 284     bool canOptimize = true;
 285     for (size_t i = 0; canOptimize && i < getParentEdges().size(); i++) {
 286         const auto& parent = getParentEdgeAt(i)->getParent();
 287         for (size_t j = 0; canOptimize && j < parent->getChildEdges().size(); j++) {
 288             const auto& child = parent->getChildEdgeAt(j)->getChild();
 289             const auto* childConcat = dynamic_cast<MKLDNNConcatNode *>(child.get());
 290             if (!childConcat || childConcat == this)
 291                 continue;
 292             if (childConcat->isOptimized())
 293                 canOptimize = false;
 294         }
 295     }
 296     if (hasUnknown && axis == 1) {
 297         if (canSelectPrimitive.size() == 1) {
 298             selectPrimitiveDescriptorByIndex(static_cast<int>(canSelectPrimitive[0]));
 299             return;
 300         }
 301     } else {
 302         canOptimize = false;
 303     }
 304
 305     std::map<mkldnn::memory::format, size_t> formatFrequency;
 306     for (size_t i = 0; i < getParentEdges().size(); i++) {
 307         auto parentEdge = getParentEdgeAt(i);
 308         auto parent = parentEdge->getParent();
 309
 310         if (parent->getSelectedPrimitiveDescriptor() == nullptr)
 311             continue;
 312
 313         int outputIndex = parentEdge->getOutputNum();
 314         if (outputIndex < 0)
 315             THROW_IE_EXCEPTION << "Cannot find index of output node";
 316         if (outputIndex >= parent->getSelectedPrimitiveDescriptor()->getConfig().outConfs.size())
 317             outputIndex = 0;
 318         auto outDesc = MKLDNNMemoryDesc(parent->getSelectedPrimitiveDescriptor()->getConfig().outConfs[outputIndex].desc);
 319         if (!outDesc)
 320             continue;
 321         if (formatFrequency.find(outDesc.getFormat()) != formatFrequency.end())
 322             formatFrequency[outDesc.getFormat()] += 1;
 323         else
 324             formatFrequency[outDesc.getFormat()] = 1;
 325     }
 326     for (size_t i = 0; i < getChildEdges().size(); i++) {
 327         auto childEdge = getChildEdgeAt(i);
 328         auto child = childEdge->getChild();
 329         if (child->getSelectedPrimitiveDescriptor() == nullptr)
 330             continue;
 331         int inputIndex = childEdge->getOutputNum();
 332         if (inputIndex < 0)
 333             THROW_IE_EXCEPTION << "Cannot find index of output node";
 334         if (inputIndex >= child->getSelectedPrimitiveDescriptor()->getConfig().inConfs.size())
 335             inputIndex = 0;
 336         auto outDesc = MKLDNNMemoryDesc(child->getSelectedPrimitiveDescriptor()->getConfig().inConfs[inputIndex].desc);
 337         if (!outDesc)
 338             continue;
 339         if (formatFrequency.find(outDesc.getFormat()) != formatFrequency.end())
 340             formatFrequency[outDesc.getFormat()] += 1;
 341         else
 342             formatFrequency[outDesc.getFormat()] = 1;
 343     }
 344
 345     size_t maxCount = 0;
 346     mkldnn::memory::format convertTo = MKLDNNMemory::GetPlainFormat(getChildEdgeAt(0)->getDims());
 347     for (auto &it : formatFrequency) {
 348         if (it.second > maxCount) {
 349             maxCount = it.second;
 350             convertTo = it.first;
 351         }
 352     }
 353
 354     if (canOptimize && MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, convertTo).blocksExtended())
 355         convertTo = MKLDNNMemory::GetPlainFormat(getChildEdgeAt(0)->getDims());
 356     for (size_t i = 0; canOptimize && i < getParentEdges().size(); i++) {
 357         if (MKLDNNMemoryDesc(getParentEdgeAt(i)->getDims(), inputDataType, convertTo).blocksExtended())
 358             convertTo = MKLDNNMemory::GetPlainFormat(getChildEdgeAt(0)->getDims());
 359     }
 360
 361     for (auto supportedPdIndex : canSelectPrimitive) {
 362         if (MKLDNNMemoryDesc(supportedPrimitiveDescriptors[supportedPdIndex].getConfig().inConfs[0].desc).getFormat() == convertTo) {
 363             selectPrimitiveDescriptorByIndex(static_cast<int>(supportedPdIndex));
 364             return;
 365         }
 366     }
 367
 368     for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); i++) {
 369         auto &primDescInfo = supportedPrimitiveDescriptors[i];
 370         if (primDescInfo.getImplementationType() == impl_desc_type::unknown)
 371             continue;
 372         if (convertTo == MKLDNNMemoryDesc(supportedPrimitiveDescriptors[i].getConfig().outConfs[0].desc).getFormat()) {
 373             size_t num = 0;
 374             for (num = 0; num < getParentEdges().size(); num++) {
 375                 if (MKLDNNMemoryDesc(getParentEdgeAt(num)->getDims(), inputDataType, convertTo).blocksExtended())
 376                     break;
 377             }
 378             if (num == getParentEdges().size()) {
 379                 selectPrimitiveDescriptorByIndex(i);
 380                 return;
 381             }
 382         }
 383     }
 384     selectPrimitiveDescriptorByIndex(0);
 385 }
 386
 387 bool MKLDNNConcatNode::created() const {
 388     return getType() == Concatenation;
 389 }
 390
 391 bool MKLDNNConcatNode::isOptimized() const {
 392     return getSelectedPrimitiveDescriptor() && getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].inPlace >= 0;
 393 }
 394
 395 void MKLDNNConcatNode::createPrimitive() {
 396     if (prim || isOptimized())
 397         return;
 398
 399     auto& dstMemPtr = getChildEdgeAt(0)->getMemoryPtr();
 400     if (!dstMemPtr || !dstMemPtr->GetPrimitivePtr())
 401         THROW_IE_EXCEPTION << "Destination memory didn't allocate.";
 402     if (getSelectedPrimitiveDescriptor() == nullptr)
 403         THROW_IE_EXCEPTION << "Preferable primitive descriptor does not set.";
 404
 405     std::vector<memory::primitive_desc> srcs_pd;
 406     std::vector<primitive::at> srcs_p;
 407
 408     for (size_t i = 0; i < getParentEdges().size(); i++) {
 409         auto& srcMemPtr = getParentEdgeAt(i)->getMemoryPtr();
 410         if (!srcMemPtr || !srcMemPtr->GetPrimitivePtr()) {
 411             auto parent = getParentEdgeAt(i)->getParent();
 412             THROW_IE_EXCEPTION << "Source memory from " << parent->getName() << " didn't allocate for node "
 413                                << getName() << ".";
 414         }
 415
 416         auto desc = srcMemPtr->GetDescriptor();
 417         auto dims = getParentEdgeAt(i)->getDims();
 418         for (size_t j = 0; j < dims.ndims(); j++) {
 419             desc.data.dims[j] = dims[j];
 420         }
 421
 422         srcs_pd.emplace_back(desc, srcMemPtr->GetPrimitiveDescriptor().get_engine());
 423         srcs_p.emplace_back(srcMemPtr->GetPrimitive());
 424     }
 425
 426     auto desc = getChildEdgeAt(0)->getMemory().GetDescriptor();
 427     auto dims = getChildEdgeAt(0)->getDims();
 428     for (size_t i = 0; i < dims.ndims(); i++) {
 429         desc.data.dims[i] = dims[i];
 430         desc.data.layout_desc.blocking.padding_dims[i] = dims[i];
 431     }
 432
 433     auto primitive_desc = concat::primitive_desc(desc, static_cast<int>(axis), srcs_pd);
 434
 435     prim.reset(new concat(primitive_desc, srcs_p, getChildEdgeAt(0)->getMemory().GetPrimitive()));
 436 }
 437
 438 size_t MKLDNNConcatNode::inverseOrder(const SizeVector& order, size_t axis) {
 439     for (size_t i = 0; i < order.size(); i++) {
 440         if (axis == order[i]) {
 441             return i;
 442         }
 443     }
 444     return -1;
 445 }
 446
 447 void MKLDNNConcatNode::initOptimalPrimitiveDescriptor() {
 448     if (!isOptimized()) {
 449         MKLDNNNode::initOptimalPrimitiveDescriptor();
 450         return;
 451     }
 452
 453     auto config = getSelectedPrimitiveDescriptor()->getConfig();
 454     if (isInitConfig(config))
 455         return;
 456
 457     for (size_t i = 0; i < config.outConfs.size(); i++) {
 458         if (config.outConfs[i].desc.getLayout() == InferenceEngine::Layout::ANY ||
 459                 !isUninitTensorDesc(config.outConfs[i].desc))
 460             continue;
 461
 462         int num = getChildEdgeAt(i)->getOutputNum();
 463         if (num >= 0) {
 464             auto childConf = getChildEdgeAt(i)->getChild()->getSelectedPrimitiveDescriptor()->getConfig().inConfs[num];
 465             childConf.desc.setPrecision(config.outConfs[i].desc.getPrecision());
 466
 467             if (getChildEdgeAt(i)->getChild()->getSelectedPrimitiveDescriptor()) {
 468                 if (isUninitTensorDesc(childConf.desc) && childConf.inPlace >= 0)
 469                     getChildEdgeAt(i)->getChild()->initOptimalPrimitiveDescriptor();
 470
 471                 if (!isUninitTensorDesc(childConf.desc) &&
 472                         MKLDNNExtensionUtils::initTensorsAreEqual(childConf.desc, config.outConfs[i].desc)) {
 473                     config.outConfs[i].desc = childConf.desc;
 474                     continue;
 475                 }
 476             }
 477         }
 478         config.outConfs[i].desc = InferenceEngine::TensorDesc(config.outConfs[i].desc.getPrecision(),
 479                                                               config.outConfs[i].desc.getDims(), {
 480                                                                       config.outConfs[i].desc.getBlockingDesc().getBlockDims(),
 481                                                                       config.outConfs[i].desc.getBlockingDesc().getOrder()
 482                                                               });
 483     }
 484     size_t offset = 0;
 485     for (size_t i = 0; i < config.inConfs.size(); i++) {
 486         config.inConfs[i].desc = InferenceEngine::TensorDesc(config.inConfs[i].desc.getPrecision(),
 487                                                              config.inConfs[i].desc.getDims(), {
 488                                                                   config.inConfs[i].desc.getBlockingDesc().getBlockDims(),
 489                                                                   config.inConfs[i].desc.getBlockingDesc().getOrder(),
 490                                                                   config.outConfs[0].desc.getBlockingDesc().getOffsetPadding() + offset,
 491                                                                   config.outConfs[0].desc.getBlockingDesc().getOffsetPaddingToData(),
 492                                                                   config.outConfs[0].desc.getBlockingDesc().getStrides()
 493                                                              });
 494         size_t axisSize = 1;
 495
 496         if (config.inConfs[0].desc.getLayout() == Layout::NHWC) {
 497             // This is more general and works for any "direct" Layout (such as nchw or nhwc), but it doesn't work for nchw8c
 498             size_t realAxis = inverseOrder(config.inConfs[0].desc.getBlockingDesc().getOrder(), axis);
 499             for (size_t j = realAxis; j < config.inConfs[i].desc.getBlockingDesc().getBlockDims().size(); j++) {
 500                 size_t jj = config.inConfs[0].desc.getBlockingDesc().getOrder()[j];
 501                 axisSize *= config.inConfs[i].desc.getBlockingDesc().getBlockDims()[jj];
 502             }
 503         } else {
 504             // This works for nchw and nchw8c/nchw16c
 505             for (size_t j = axis; j < config.inConfs[i].desc.getBlockingDesc().getBlockDims().size(); j++) {
 506                 axisSize *= config.inConfs[i].desc.getBlockingDesc().getBlockDims()[j];
 507             }
 508         }
 509         offset += axisSize;
 510     }
 511     initDescriptor(config);
 512 }
 513
 514 void MKLDNNConcatNode::execute(mkldnn::stream strm) {
 515     if (isOptimized()) {
 516         return;
 517     }
 518
 519     const MKLDNNMemory& dst_memory = getChildEdgeAt(0)->getMemory();
 520     const mkldnn::memory::data_type data_type = dst_memory.GetDataType();
 521
 522     const bool isInt8 = (data_type == mkldnn_s8 || data_type == mkldnn_u8);
 523
 524     if (isInt8) {
 525         uint8_t* dst_ptr = reinterpret_cast<uint8_t*>(dst_memory.GetData());
 526
 527         const size_t num_src = getParentEdges().size();
 528
 529         std::vector<size_t> channels;
 530         size_t channels_size = 0;
 531         std::vector<const uint8_t*> src_ptrs;
 532         std::vector<uint8_t*> dst_ptrs;
 533
 534         for (size_t i = 0; i < num_src; i++) {
 535             const MKLDNNMemory& src_mem = getParentEdgeAt(i)->getMemory();
 536             const size_t num_channels = src_mem.GetDims()[1];
 537
 538             channels.push_back(num_channels);
 539             src_ptrs.push_back(reinterpret_cast<const uint8_t*>(src_mem.GetData()));
 540             dst_ptrs.push_back(dst_ptr + channels_size);
 541             channels_size += num_channels;
 542         }
 543
 544         const size_t iter_count = getParentEdgeAt(0)->getMemory().GetSize() / channels[0];
 545
 546         parallel_for(iter_count, [&](int i) {
 547             const size_t dst_off = i * channels_size;
 548             for (int j = 0; j < num_src; j++) {
 549                 memcpy(dst_ptrs[j] + dst_off, src_ptrs[j] + i * channels[j], channels[j]);
 550             }
 551         });
 552     } else {
 553         MKLDNNNode::execute(strm);
 554     }
 555 }