From 747ef82c88f9afe14a8b80b6b3b34118353e97f2 Mon Sep 17 00:00:00 2001 From: Matteo Martincigh Date: Tue, 18 Dec 2018 09:26:39 +0000 Subject: [PATCH] MLCE-77 Depthwise Convolution with depth multiplier > 1 doesn't work * Unified ArmNN's weight format to [ M, I, H, W ] for the depthwise convolution * Added conversion utilities to permute/reshape the weights as appropriate when using CL and Neon backends * Updated the reference implementation of the convolution * Updated the relevant unit tests accordingly !android-nn-driver:459 Change-Id: I07d0818efa9d1ca1e5dad82983aac1fe78eadb18 --- src/armnn/layers/DepthwiseConvolution2dLayer.cpp | 40 ++++---- src/armnn/test/CreateWorkload.hpp | 20 ++-- src/armnn/test/OptimizerTests.cpp | 2 +- src/armnnTfLiteParser/TfLiteParser.cpp | 105 +++++++++++++------ src/armnnTfLiteParser/TfLiteParser.hpp | 32 ++++-- src/armnnTfParser/TfParser.cpp | 16 ++- src/armnnUtils/ParserPrototxtFixture.hpp | 2 - src/armnnUtils/Permute.cpp | 57 +++++++++++ src/armnnUtils/Permute.hpp | 5 +- src/backends/aclCommon/ArmComputeTensorUtils.cpp | 26 ++--- src/backends/aclCommon/ArmComputeTensorUtils.hpp | 8 +- src/backends/backendsCommon/CMakeLists.txt | 1 + src/backends/backendsCommon/CpuTensorHandle.cpp | 4 +- src/backends/backendsCommon/CpuTensorHandle.hpp | 6 ++ src/backends/backendsCommon/WorkloadData.cpp | 5 +- src/backends/backendsCommon/WorkloadUtils.cpp | 111 +++++++++++++++++++++ src/backends/backendsCommon/WorkloadUtils.hpp | 41 ++++++-- src/backends/backendsCommon/common.mk | 3 +- .../backendsCommon/test/Conv2dTestImpl.hpp | 64 +++++------- src/backends/backendsCommon/test/LayerTests.cpp | 30 ++---- .../workloads/ClDepthwiseConvolutionWorkload.cpp | 49 ++++++--- .../workloads/NeonDepthwiseConvolutionWorkload.cpp | 72 +++++++------ src/backends/reference/workloads/ConvImpl.hpp | 93 ++++++++--------- .../workloads/RefConvolution2dFloat32Workload.cpp | 8 +- .../workloads/RefConvolution2dUint8Workload.cpp | 7 +- 
.../RefDepthwiseConvolution2dFloat32Workload.cpp | 6 +- .../RefDepthwiseConvolution2dUint8Workload.cpp | 7 +- 27 files changed, 529 insertions(+), 291 deletions(-) create mode 100644 src/backends/backendsCommon/WorkloadUtils.cpp diff --git a/src/armnn/layers/DepthwiseConvolution2dLayer.cpp b/src/armnn/layers/DepthwiseConvolution2dLayer.cpp index 95d4690..c4edc20 100644 --- a/src/armnn/layers/DepthwiseConvolution2dLayer.cpp +++ b/src/armnn/layers/DepthwiseConvolution2dLayer.cpp @@ -24,7 +24,7 @@ DepthwiseConvolution2dLayer::DepthwiseConvolution2dLayer(const DepthwiseConvolut { } -std::unique_ptr DepthwiseConvolution2dLayer::CreateWorkload(const Graph& graph, +std::unique_ptr DepthwiseConvolution2dLayer::CreateWorkload(const Graph& graph, const IWorkloadFactory& factory) const { // on this level constant data should not be released.. @@ -59,34 +59,40 @@ std::vector DepthwiseConvolution2dLayer::InferOutputShapes(const std::vector& inputShapes) const { BOOST_ASSERT(inputShapes.size() == 2); - const TensorShape& inputShape = inputShapes[0]; - const TensorShape filterShape = inputShapes[1]; + const TensorShape& inputShape = inputShapes[0]; + const TensorShape& filterShape = inputShapes[1]; BOOST_ASSERT_MSG(inputShape.GetNumDimensions() == 4, "Convolutions will always have 4D input."); DataLayoutIndexed dataLayoutIndex(m_Param.m_DataLayout); - unsigned int inWidth = inputShape[dataLayoutIndex.GetWidthIndex()]; - unsigned int inHeight = inputShape[dataLayoutIndex.GetHeightIndex()]; - unsigned int inBatchSize = inputShape[0]; + unsigned int inputBatchSize = inputShape[0]; + unsigned int inputHeight = inputShape[dataLayoutIndex.GetHeightIndex()]; + unsigned int inputWidth = inputShape[dataLayoutIndex.GetWidthIndex()]; + unsigned int inputChannels = inputShape[dataLayoutIndex.GetChannelsIndex()]; - unsigned int filterWidth = filterShape[dataLayoutIndex.GetWidthIndex()]; - unsigned int readWidth = (inWidth + m_Param.m_PadLeft + m_Param.m_PadRight) - (filterWidth); - unsigned 
int outWidth = 1 + (readWidth / m_Param.m_StrideX); + // Expected filter shape: [ M, I, H, W ] - This shape does NOT depend on the data layout + // Namely: [ depth multiplier, input channels, filter height, filter width ] + // Output channels = input channels * depthMultiplier - unsigned int filterHeight = filterShape[dataLayoutIndex.GetHeightIndex()]; - unsigned int readHeight = (inHeight + m_Param.m_PadTop + m_Param.m_PadBottom) - (filterHeight); - unsigned int outHeight = 1 + (readHeight / m_Param.m_StrideY); unsigned int depthMultiplier = filterShape[0]; - unsigned int outChannels = filterShape[dataLayoutIndex.GetChannelsIndex()] * depthMultiplier; - unsigned int outBatchSize = inBatchSize; + unsigned int filterHeight = filterShape[2]; + unsigned int readHeight = (inputHeight + m_Param.m_PadTop + m_Param.m_PadBottom) - filterHeight; + unsigned int outputHeight = 1 + (readHeight / m_Param.m_StrideY); + + unsigned int filterWidth = filterShape[3]; + unsigned int readWidth = (inputWidth + m_Param.m_PadLeft + m_Param.m_PadRight) - filterWidth; + unsigned int outputWidth = 1 + (readWidth / m_Param.m_StrideX); + + unsigned int outputChannels = inputChannels * depthMultiplier; + unsigned int outputBatchSize = inputBatchSize; TensorShape tensorShape = m_Param.m_DataLayout == armnn::DataLayout::NHWC ? 
- TensorShape( { outBatchSize, outHeight, outWidth, outChannels } ) : - TensorShape( { outBatchSize, outChannels, outHeight, outWidth }); + TensorShape{ outputBatchSize, outputHeight, outputWidth, outputChannels } : + TensorShape{ outputBatchSize, outputChannels, outputHeight, outputWidth }; - return std::vector({ tensorShape }); + return std::vector{ tensorShape }; } void DepthwiseConvolution2dLayer::ValidateTensorShapesFromInputs() diff --git a/src/armnn/test/CreateWorkload.hpp b/src/armnn/test/CreateWorkload.hpp index 3dc18b9..f52f605 100644 --- a/src/armnn/test/CreateWorkload.hpp +++ b/src/armnn/test/CreateWorkload.hpp @@ -414,18 +414,18 @@ std::unique_ptr CreateDepthwiseConvolutio { // Creates the layer we're testing. DepthwiseConvolution2dDescriptor layerDesc; - layerDesc.m_PadLeft = 1; - layerDesc.m_PadRight = 2; - layerDesc.m_PadTop = 1; - layerDesc.m_PadBottom = 2; - layerDesc.m_StrideX = 1; - layerDesc.m_StrideY = 1; - layerDesc.m_BiasEnabled = false; - layerDesc.m_DataLayout = dataLayout; + layerDesc.m_PadLeft = 1; + layerDesc.m_PadRight = 2; + layerDesc.m_PadTop = 1; + layerDesc.m_PadBottom = 2; + layerDesc.m_StrideX = 1; + layerDesc.m_StrideY = 1; + layerDesc.m_BiasEnabled = false; + layerDesc.m_DataLayout = dataLayout; DepthwiseConvolution2dLayer* const layer = graph.AddLayer(layerDesc, "layer"); - layer->m_Weight = std::make_unique(TensorInfo({1, 4, 4, 2}, DataType)); + layer->m_Weight = std::make_unique(TensorInfo({1, 2, 4, 4}, DataType)); // [ M, I, H, W ] layer->m_Weight->Allocate(); // Creates extra layers. @@ -457,7 +457,7 @@ std::unique_ptr CreateDepthwiseConvolutio BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - BOOST_TEST((queueDescriptor.m_Weight->GetTensorInfo() == TensorInfo({1, 4, 4, 2}, DataType))); + BOOST_TEST((queueDescriptor.m_Weight->GetTensorInfo() == TensorInfo({1, 2, 4, 4}, DataType))); // Returns so we can do extra, backend-specific tests. 
return workload; diff --git a/src/armnn/test/OptimizerTests.cpp b/src/armnn/test/OptimizerTests.cpp index 29d1702..80addb4 100644 --- a/src/armnn/test/OptimizerTests.cpp +++ b/src/armnn/test/OptimizerTests.cpp @@ -898,7 +898,7 @@ BOOST_AUTO_TEST_CASE(DepthwiseConv2dValidateTensorShapesFromInputsNhwc) { Graph graph; const unsigned int inputShape[] = { 1, 3, 3, 2 }; - const unsigned int weightsShape[] = { 1, 3, 3, 2 }; + const unsigned int weightsShape[] = { 1, 2, 3, 3 }; const unsigned int outputShape[] = { 1, 1, 1, 2 }; CreateDepthwiseConvolution2dGraph(graph, inputShape, weightsShape, outputShape, DataLayout::NHWC); diff --git a/src/armnnTfLiteParser/TfLiteParser.cpp b/src/armnnTfLiteParser/TfLiteParser.cpp index 49bc737..3b50476 100644 --- a/src/armnnTfLiteParser/TfLiteParser.cpp +++ b/src/armnnTfLiteParser/TfLiteParser.cpp @@ -401,7 +401,8 @@ template std::pair> CreateConstTensorImpl(TfLiteParser::BufferRawPtr bufferPtr, TfLiteParser::TensorRawPtr tensorPtr, - armnn::TensorInfo & tensorInfo) + armnn::TensorInfo& tensorInfo, + armnn::Optional permutationVector) { BOOST_ASSERT_MSG(tensorPtr != nullptr, "tensorPtr is null"); BOOST_ASSERT_MSG(bufferPtr != nullptr, @@ -409,7 +410,20 @@ CreateConstTensorImpl(TfLiteParser::BufferRawPtr bufferPtr, boost::format("Buffer for buffer:%1% is null") % tensorPtr->buffer).c_str()); std::unique_ptr data(new T[tensorInfo.GetNumElements()]); - ::memcpy(data.get(), bufferPtr->data.data(), tensorInfo.GetNumBytes()); + + if (permutationVector.has_value() && permutationVector.value().GetSize() > 0) + { + tensorInfo = armnnUtils::Permuted(tensorInfo, permutationVector.value()); + armnnUtils::Permute(tensorInfo.GetShape(), + permutationVector.value(), + reinterpret_cast(bufferPtr->data.data()), + data.get()); + } + else + { + ::memcpy(data.get(), bufferPtr->data.data(), tensorInfo.GetNumBytes()); + } + return std::make_pair(ConstTensor(tensorInfo, data.get()), std::move(data)); } @@ -660,7 +674,9 @@ void TfLiteParser::ParseConv2D(size_t 
subgraphIndex, size_t operatorIndex) CalcPadding(inputHeight, filterHeight, desc.m_StrideY, desc.m_PadTop, desc.m_PadBottom, options->padding); CalcPadding(inputWidth, filterWidth, desc.m_StrideX, desc.m_PadLeft, desc.m_PadRight, options->padding); - auto filterTensorAndData = CreateConstTensor(inputs[1], filterTensorInfo); + auto filterTensorAndData = CreateConstTensor(inputs[1], + filterTensorInfo, + armnn::Optional()); armnn::IConnectableLayer* layer; auto layerName = boost::str(boost::format("Conv2D:%1%:%2%") % subgraphIndex % operatorIndex); @@ -669,7 +685,9 @@ void TfLiteParser::ParseConv2D(size_t subgraphIndex, size_t operatorIndex) { desc.m_BiasEnabled = true; armnn::TensorInfo biasTensorInfo = ToTensorInfo(inputs[2]); - auto biasTensorAndData = CreateConstTensor(inputs[2], biasTensorInfo); + auto biasTensorAndData = CreateConstTensor(inputs[2], + biasTensorInfo, + armnn::Optional()); layer = m_Network->AddConvolution2dLayer(desc, filterTensorAndData.first, biasTensorAndData.first, @@ -723,17 +741,27 @@ void TfLiteParser::ParseDepthwiseConv2D(size_t subgraphIndex, size_t operatorInd armnn::TensorInfo inputTensorInfo = ToTensorInfo(inputs[0]); armnn::TensorInfo filterTensorInfo = ToTensorInfo(inputs[1]); - // assuming input is NHWC + // Assuming input is NHWC unsigned int inputHeight = inputTensorInfo.GetShape()[1]; unsigned int inputWidth = inputTensorInfo.GetShape()[2]; - // assuming the filter is OHWI : Output, H, W, Input + + // TensorflowLite weights come in the format [1, H, W, I * M] unsigned int filterHeight = filterTensorInfo.GetShape()[1]; unsigned int filterWidth = filterTensorInfo.GetShape()[2]; + // Reshape weights as [ H, W, I, M ] + filterTensorInfo.SetShape({ filterHeight, + filterWidth, + inputTensorInfo.GetShape()[3], + filterTensorInfo.GetShape()[3] / inputTensorInfo.GetShape()[3] }); + + // Mappings from TensorflowLite filter tensors to the ArmNN filter tensors (ArmNN weights have to be [M, I, H, W]) + PermutationVector permutationVector{ 
2, 3, 1, 0 }; // [H, W, I, M] -> [M, I, H, W] + CalcPadding(inputHeight, filterHeight, desc.m_StrideY, desc.m_PadTop, desc.m_PadBottom, options->padding); CalcPadding(inputWidth, filterWidth, desc.m_StrideX, desc.m_PadLeft, desc.m_PadRight, options->padding); - auto filterTensorAndData = CreateConstTensor(inputs[1], filterTensorInfo); + auto filterTensorAndData = CreateConstTensor(inputs[1], filterTensorInfo, permutationVector); armnn::IConnectableLayer* layer; auto layerName = boost::str(boost::format("DepthwiseConv2D:%1%:%2%") % subgraphIndex % operatorIndex); @@ -741,7 +769,9 @@ void TfLiteParser::ParseDepthwiseConv2D(size_t subgraphIndex, size_t operatorInd { desc.m_BiasEnabled = true; TensorInfo biasTensorInfo = ToTensorInfo(inputs[2]); - auto biasTensorAndData = CreateConstTensor(inputs[2], biasTensorInfo); + auto biasTensorAndData = CreateConstTensor(inputs[2], + biasTensorInfo, + armnn::Optional()); layer = m_Network->AddDepthwiseConvolution2dLayer(desc, filterTensorAndData.first, biasTensorAndData.first, @@ -1228,7 +1258,9 @@ void TfLiteParser::ParseFullyConnected(size_t subgraphIndex, size_t operatorInde % CHECK_LOCATION().AsString())); } - auto filterTensorAndData = CreateConstTensor(inputs[1], filterTensorInfo); + auto filterTensorAndData = CreateConstTensor(inputs[1], + filterTensorInfo, + armnn::Optional()); armnn::IConnectableLayer* layer; auto layerName = boost::str(boost::format("FullyConnected:%1%:%2%") % subgraphIndex % operatorIndex); @@ -1236,7 +1268,9 @@ void TfLiteParser::ParseFullyConnected(size_t subgraphIndex, size_t operatorInde { desc.m_BiasEnabled = true; TensorInfo biasTensorInfo = ToTensorInfo(inputs[2]); - auto biasTensorAndData = CreateConstTensor(inputs[2], biasTensorInfo); + auto biasTensorAndData = CreateConstTensor(inputs[2], + biasTensorInfo, + armnn::Optional()); layer = m_Network->AddFullyConnectedLayer(desc, filterTensorAndData.first, biasTensorAndData.first, @@ -1561,9 +1595,25 @@ TfLiteParser::BufferRawPtr 
TfLiteParser::GetBuffer(const ModelPtr& model, size_t return model->buffers[bufferIndex].get(); } +template +std::pair +TfLiteParser::CreateConstTensorAndStoreData(TfLiteParser::BufferRawPtr bufferPtr, + TfLiteParser::TensorRawPtr tensorPtr, + armnn::TensorInfo& tensorInfo, + armnn::Optional permutationVector) +{ + auto constData = CreateConstTensorImpl(bufferPtr, + tensorPtr, + tensorInfo, + permutationVector); + TfLiteParser::SupportedDataStorage storage(std::move(constData.second)); + return std::make_pair(constData.first, std::move(storage)); +} + std::pair TfLiteParser::CreateConstTensor(TensorRawPtr tensorPtr, - armnn::TensorInfo & tensorInfo) + armnn::TensorInfo& tensorInfo, + armnn::Optional permutationVector) { CHECK_TENSOR_PTR(tensorPtr); auto bufferPtr = GetBuffer(m_Model, tensorPtr->buffer); @@ -1572,29 +1622,20 @@ TfLiteParser::CreateConstTensor(TensorRawPtr tensorPtr, switch (tensorInfo.GetDataType()) { case armnn::DataType::Float32: - { - auto constData = CreateConstTensorImpl(bufferPtr, - tensorPtr, - tensorInfo); - SupportedDataStorage storage(std::move(constData.second)); - return std::make_pair(constData.first, std::move(storage)); - } + return CreateConstTensorAndStoreData(bufferPtr, + tensorPtr, + tensorInfo, + permutationVector); case armnn::DataType::QuantisedAsymm8: - { - auto constData = CreateConstTensorImpl(bufferPtr, - tensorPtr, - tensorInfo); - SupportedDataStorage storage(std::move(constData.second)); - return std::make_pair(constData.first, std::move(storage)); - } + return CreateConstTensorAndStoreData(bufferPtr, + tensorPtr, + tensorInfo, + permutationVector); case armnn::DataType::Signed32: - { - auto constData = CreateConstTensorImpl(bufferPtr, - tensorPtr, - tensorInfo); - SupportedDataStorage storage(std::move(constData.second)); - return std::make_pair(constData.first, std::move(storage)); - } + return CreateConstTensorAndStoreData(bufferPtr, + tensorPtr, + tensorInfo, + permutationVector); default: { std::stringstream 
errString; diff --git a/src/armnnTfLiteParser/TfLiteParser.hpp b/src/armnnTfLiteParser/TfLiteParser.hpp index e7a7469..9195728 100644 --- a/src/armnnTfLiteParser/TfLiteParser.hpp +++ b/src/armnnTfLiteParser/TfLiteParser.hpp @@ -129,17 +129,31 @@ private: // We don't care about the content, and we want a single datatype to simplify the code. struct SupportedDataStorage { - std::unique_ptr m_FloatData; - std::unique_ptr m_Uint8Data; - std::unique_ptr m_Int32Data; - - SupportedDataStorage(std::unique_ptr && data); - SupportedDataStorage(std::unique_ptr && data); - SupportedDataStorage(std::unique_ptr && data); + public: + // Convenience constructors + SupportedDataStorage(std::unique_ptr&& data); + SupportedDataStorage(std::unique_ptr&& data); + SupportedDataStorage(std::unique_ptr&& data); + + private: + // Pointers to the data buffers + std::unique_ptr m_FloatData; + std::unique_ptr m_Uint8Data; + std::unique_ptr m_Int32Data; }; - std::pair CreateConstTensor(TensorRawPtr tensorPtr, - armnn::TensorInfo & tensorInfo); + + template + std::pair + CreateConstTensorAndStoreData(TfLiteParser::BufferRawPtr bufferPtr, + TfLiteParser::TensorRawPtr tensorPtr, + armnn::TensorInfo& tensorInfo, + armnn::Optional permutationVector); + + std::pair + CreateConstTensor(TensorRawPtr tensorPtr, + armnn::TensorInfo& tensorInfo, + armnn::Optional permutationVector); /// The network we're building. Gets cleared after it is passed to the user armnn::INetworkPtr m_Network; diff --git a/src/armnnTfParser/TfParser.cpp b/src/armnnTfParser/TfParser.cpp index 7f04757..7a213c0 100644 --- a/src/armnnTfParser/TfParser.cpp +++ b/src/armnnTfParser/TfParser.cpp @@ -1338,13 +1338,9 @@ ParsedTfOperationPtr TfParser::ParseDepthwiseConv2D(const tensorflow::NodeDef& n uint32_t inputWidth = inputTensorInfo.GetShape()[dataLayoutIndexed.GetWidthIndex()]; // Mappings from TensorFlow filter tensors to the ArmNN filter tensors. - // Tensorflow weights are [H, W, In, Out]. 
- // ArmNN weights have to be [Out, H, W, In] when the data layout is NHWC, - // and [Out, In, H, W] when the data layout is NCHW. - PermutationVector permutationVector = - dataLayout == DataLayout::NHWC ? - std::initializer_list{ 1, 2, 3, 0 } : // NHWC: [H, W, In, Out] -> [Out, H, W, In] - std::initializer_list{ 2, 3, 1, 0 }; // NCHW: [H, W, In, Out] -> [Out, In, H, W] + // Tensorflow weights come in the format [H, W, I, M]. + // ArmNN weights have to be [M, I, H, W]. + PermutationVector permutationVector{ 2, 3, 1, 0 }; // [H, W, I, M] -> [M, I, H, W] // Swizzle the tensor using the given permutation vector. const TensorInfo& weightTensorInfo = weightNode->GetTensorInfo(); @@ -1358,8 +1354,8 @@ ParsedTfOperationPtr TfParser::ParseDepthwiseConv2D(const tensorflow::NodeDef& n // Create a weight tensor with the newly swizzled data. ConstTensor weightTensor(weightTensorSwizzledInfo, weightTensorSwizzledData); - uint32_t weightHeight = weightTensor.GetShape()[dataLayoutIndexed.GetHeightIndex()]; - uint32_t weightWidth = weightTensor.GetShape()[dataLayoutIndexed.GetWidthIndex()]; + uint32_t weightHeight = weightTensor.GetShape()[2]; + uint32_t weightWidth = weightTensor.GetShape()[3]; bool padding = false; TensorInfo outputInfo; @@ -1393,7 +1389,7 @@ ParsedTfOperationPtr TfParser::ParseDepthwiseConv2D(const tensorflow::NodeDef& n outputInfo = TensorInfo({ inputTensorInfo.GetShape()[0], outputHeight, outputWidth, - weightTensor.GetShape()[0] * weightTensor.GetShape()[3]}, + weightTensor.GetShape()[0] * weightTensor.GetShape()[1]}, DataType::Float32); break; case DataLayout::NCHW: diff --git a/src/armnnUtils/ParserPrototxtFixture.hpp b/src/armnnUtils/ParserPrototxtFixture.hpp index fa21aba..acb8f82 100644 --- a/src/armnnUtils/ParserPrototxtFixture.hpp +++ b/src/armnnUtils/ParserPrototxtFixture.hpp @@ -14,8 +14,6 @@ #include #include -#include - #include #include diff --git a/src/armnnUtils/Permute.cpp b/src/armnnUtils/Permute.cpp index 61f4e0e..6deff90 100644 --- 
a/src/armnnUtils/Permute.cpp +++ b/src/armnnUtils/Permute.cpp @@ -9,6 +9,7 @@ #include #include +#include namespace { @@ -46,10 +47,29 @@ public: Unroll(0, srcData, dstData, srcEnd, dstEnd); } + void Unroll(const void* srcData, void* dstData, size_t dataTypeSize) + { + assert(srcData); + assert(dstData); + assert(dataTypeSize > 0); + + const unsigned char* srcDataPtr = reinterpret_cast(srcData); + unsigned char* dstDataPtr = reinterpret_cast(dstData); + + const unsigned char* const srcEndPtr = srcDataPtr + m_DstShape.GetNumElements() * dataTypeSize; + unsigned char* const dstEndPtr = dstDataPtr + m_DstShape.GetNumElements() * dataTypeSize; + + Unroll(0, srcDataPtr, dstDataPtr, srcEndPtr, dstEndPtr, dataTypeSize); + } + private: template void Unroll(size_type dimension, const T* srcData, T* dstData, const T* srcEnd, T* dstEnd) { + assert(srcData); + assert(dstData); + assert(srcEnd); + assert(dstEnd); assert(srcData < srcEnd); assert(dstData < dstEnd); @@ -69,6 +89,35 @@ private: } } + void Unroll(size_type dimension, + const unsigned char* srcData, unsigned char* dstData, + const unsigned char* srcEnd, unsigned char* dstEnd, + size_t dataTypeSize) + { + assert(srcData); + assert(dstData); + assert(srcEnd); + assert(dstEnd); + assert(srcData < srcEnd); + assert(dstData < dstEnd); + assert(dataTypeSize > 0); + + if (dimension >= m_DstShape.GetNumDimensions()) + { + ::memcpy(dstData, srcData, dataTypeSize); + } + else + { + for (size_type i = 0; i < m_DstShape[dimension]; i++) + { + Unroll(dimension + 1, srcData, dstData, srcEnd, dstEnd, dataTypeSize); + + srcData += m_SrcStrides[dimension] * dataTypeSize; + dstData += m_DstStrides[dimension] * dataTypeSize; + } + } + } + armnn::TensorShape m_DstShape; std::array m_SrcStrides; std::array m_DstStrides; @@ -102,6 +151,12 @@ armnn::TensorInfo Permuted(const armnn::TensorInfo& info, const armnn::Permutati return outInfo; } +void Permute(const armnn::TensorShape& dstShape, const armnn::PermutationVector& mappings, + const 
void* src, void* dst, size_t dataTypeSize) +{ + PermuteLoop(dstShape, mappings).Unroll(src, dst, dataTypeSize); +} + template void Permute(const armnn::TensorShape& dstShape, const armnn::PermutationVector& mappings, const T* src, T* dst) { @@ -117,5 +172,7 @@ template void Permute(const armnn::TensorShape& dstShape, const armnn::Permutati const uint8_t* src, uint8_t* dst); template void Permute(const armnn::TensorShape& dstShape, const armnn::PermutationVector& mappings, const int32_t* src, int32_t* dst); +template void Permute(const armnn::TensorShape& dstShape, const armnn::PermutationVector& mappings, + const bool* src, bool* dst); } // namespace armnnUtils diff --git a/src/armnnUtils/Permute.hpp b/src/armnnUtils/Permute.hpp index 700ddc7..4e43198 100644 --- a/src/armnnUtils/Permute.hpp +++ b/src/armnnUtils/Permute.hpp @@ -14,7 +14,10 @@ armnn::TensorShape Permuted(const armnn::TensorShape& srcShape, const armnn::Per armnn::TensorInfo Permuted(const armnn::TensorInfo& info, const armnn::PermutationVector& mappings); +void Permute(const armnn::TensorShape& dstShape, const armnn::PermutationVector& mappings, + const void* src, void* dst, size_t dataTypeSize); + template void Permute(const armnn::TensorShape& dstShape, const armnn::PermutationVector& mappings, const T* src, T* dst); -} // namespace armnnUtils \ No newline at end of file +} // namespace armnnUtils diff --git a/src/backends/aclCommon/ArmComputeTensorUtils.cpp b/src/backends/aclCommon/ArmComputeTensorUtils.cpp index a2d7d8c..32af42f 100644 --- a/src/backends/aclCommon/ArmComputeTensorUtils.cpp +++ b/src/backends/aclCommon/ArmComputeTensorUtils.cpp @@ -109,19 +109,6 @@ arm_compute::TensorInfo BuildArmComputeTensorInfo(const armnn::TensorInfo& tenso return arm_compute::TensorInfo(aclTensorShape, 1, aclDataType, aclQuantizationInfo); } -arm_compute::DataLayout ConvertDataLayout(armnn::DataLayout dataLayout) -{ - switch(dataLayout) - { - case armnn::DataLayout::NHWC : return 
arm_compute::DataLayout::NHWC; - - case armnn::DataLayout::NCHW : return arm_compute::DataLayout::NCHW; - - default: throw InvalidArgumentException("Unknown armnn::DataLayout: [" + - std::to_string(static_cast(dataLayout)) + "]"); - } -} - arm_compute::TensorInfo BuildArmComputeTensorInfo(const armnn::TensorInfo& tensorInfo, armnn::DataLayout dataLayout) { @@ -136,6 +123,19 @@ arm_compute::TensorInfo BuildArmComputeTensorInfo(const armnn::TensorInfo& tenso return clTensorInfo; } +arm_compute::DataLayout ConvertDataLayout(armnn::DataLayout dataLayout) +{ + switch(dataLayout) + { + case armnn::DataLayout::NHWC : return arm_compute::DataLayout::NHWC; + + case armnn::DataLayout::NCHW : return arm_compute::DataLayout::NCHW; + + default: throw InvalidArgumentException("Unknown armnn::DataLayout: [" + + std::to_string(static_cast(dataLayout)) + "]"); + } +} + arm_compute::PoolingLayerInfo BuildArmComputePoolingLayerInfo(const Pooling2dDescriptor& descriptor) { using arm_compute::PoolingType; diff --git a/src/backends/aclCommon/ArmComputeTensorUtils.hpp b/src/backends/aclCommon/ArmComputeTensorUtils.hpp index fbd850c..fa455b7 100644 --- a/src/backends/aclCommon/ArmComputeTensorUtils.hpp +++ b/src/backends/aclCommon/ArmComputeTensorUtils.hpp @@ -36,16 +36,16 @@ arm_compute::TensorShape BuildArmComputeTensorShape(const armnn::TensorShape& te /// armnn::ITensorInfo. arm_compute::TensorInfo BuildArmComputeTensorInfo(const armnn::TensorInfo& tensorInfo); -/// Utility function used to convert armnn::DataLayout to arm_compute::DataLayout -/// armnn::DataLayout. -arm_compute::DataLayout ConvertDataLayout(armnn::DataLayout dataLayout); - /// Utility function used to setup an arm_compute::ITensorInfo object whose dimensions are based on the given /// armnn::ITensorInfo. /// armnn::DataLayout. 
arm_compute::TensorInfo BuildArmComputeTensorInfo(const armnn::TensorInfo& tensorInfo, armnn::DataLayout dataLayout); +/// Utility function used to convert armnn::DataLayout to arm_compute::DataLayout +/// armnn::DataLayout. +arm_compute::DataLayout ConvertDataLayout(armnn::DataLayout dataLayout); + /// Utility function used to setup an arm_compute::PoolingLayerInfo object from an armnn::Pooling2dDescriptor. arm_compute::PoolingLayerInfo BuildArmComputePoolingLayerInfo(const Pooling2dDescriptor& descriptor); diff --git a/src/backends/backendsCommon/CMakeLists.txt b/src/backends/backendsCommon/CMakeLists.txt index f295630..b120f51 100644 --- a/src/backends/backendsCommon/CMakeLists.txt +++ b/src/backends/backendsCommon/CMakeLists.txt @@ -27,6 +27,7 @@ list(APPEND armnnBackendsCommon_sources WorkloadFactory.hpp Workload.hpp WorkloadInfo.hpp + WorkloadUtils.cpp WorkloadUtils.hpp ) diff --git a/src/backends/backendsCommon/CpuTensorHandle.cpp b/src/backends/backendsCommon/CpuTensorHandle.cpp index fe0c634..9dcd3f3 100644 --- a/src/backends/backendsCommon/CpuTensorHandle.cpp +++ b/src/backends/backendsCommon/CpuTensorHandle.cpp @@ -18,7 +18,7 @@ ConstCpuTensorHandle::ConstCpuTensorHandle(const TensorInfo& tensorInfo) } template <> -const void* ConstCpuTensorHandle::GetConstTensor() const +const void* ConstCpuTensorHandle::GetConstTensor() const { return m_Memory; } @@ -30,7 +30,7 @@ CpuTensorHandle::CpuTensorHandle(const TensorInfo& tensorInfo) } template <> -void* CpuTensorHandle::GetTensor() const +void* CpuTensorHandle::GetTensor() const { return m_MutableMemory; } diff --git a/src/backends/backendsCommon/CpuTensorHandle.hpp b/src/backends/backendsCommon/CpuTensorHandle.hpp index ae13d6c..b88a0d3 100644 --- a/src/backends/backendsCommon/CpuTensorHandle.hpp +++ b/src/backends/backendsCommon/CpuTensorHandle.hpp @@ -72,6 +72,9 @@ private: const void* m_Memory; }; +template<> +const void* ConstCpuTensorHandle::GetConstTensor() const; + // Abstract specialization of 
ConstCpuTensorHandle that allows write access to the same data. class CpuTensorHandle : public ConstCpuTensorHandle { @@ -99,6 +102,9 @@ private: void* m_MutableMemory; }; +template <> +void* CpuTensorHandle::GetTensor() const; + // A CpuTensorHandle that owns the wrapped memory region. class ScopedCpuTensorHandle : public CpuTensorHandle { diff --git a/src/backends/backendsCommon/WorkloadData.cpp b/src/backends/backendsCommon/WorkloadData.cpp index 8847b4e..1dac498 100644 --- a/src/backends/backendsCommon/WorkloadData.cpp +++ b/src/backends/backendsCommon/WorkloadData.cpp @@ -593,9 +593,10 @@ void DepthwiseConvolution2dQueueDescriptor::Validate(const WorkloadInfo& workloa const unsigned int channelIndex = (m_Parameters.m_DataLayout == DataLayout::NCHW) ? 1 : 3; - //inputChannels * channelMultiplier should be equal to outputChannels. + // Expected weight shape: [ M, I, H, W ] - This shape does NOT depend on the data layout + // inputChannels * channelMultiplier should be equal to outputChannels. const unsigned int numWeightChannelMultiplier = m_Weight->GetTensorInfo().GetShape()[0]; - const unsigned int numWeightInputChannels = m_Weight->GetTensorInfo().GetShape()[channelIndex]; + const unsigned int numWeightInputChannels = m_Weight->GetTensorInfo().GetShape()[1]; const unsigned int numWeightOutputChannels = workloadInfo.m_OutputTensorInfos[0].GetShape()[channelIndex]; if (numWeightChannelMultiplier * numWeightInputChannels != numWeightOutputChannels) { diff --git a/src/backends/backendsCommon/WorkloadUtils.cpp b/src/backends/backendsCommon/WorkloadUtils.cpp new file mode 100644 index 0000000..fa387a7 --- /dev/null +++ b/src/backends/backendsCommon/WorkloadUtils.cpp @@ -0,0 +1,111 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. 
+// SPDX-License-Identifier: MIT +// + +#include "WorkloadUtils.hpp" + +namespace armnn +{ + +armnn::ConstTensor PermuteTensor(const ConstCpuTensorHandle* tensor, + const PermutationVector& permutationVector, + void* permuteBuffer) +{ + BOOST_ASSERT_MSG(tensor, "Invalid input tensor"); + BOOST_ASSERT_MSG(permuteBuffer, "Invalid permute buffer"); + + TensorInfo tensorInfo = tensor->GetTensorInfo(); + + if (permutationVector.GetSize() > 0) + { + tensorInfo = armnnUtils::Permuted(tensorInfo, permutationVector); + armnnUtils::Permute(tensorInfo.GetShape(), permutationVector, + tensor->GetConstTensor(), permuteBuffer, + GetDataTypeSize(tensorInfo.GetDataType())); + } + else + { + ::memcpy(permuteBuffer, tensor->GetConstTensor(), tensorInfo.GetNumBytes()); + } + + return ConstTensor(tensorInfo, permuteBuffer); +} + +void ReshapeWeightsForAcl(TensorInfo& weightInfo, DataLayout dataLayout) +{ + // Reshape the weights in-place + const TensorShape& weightShape = weightInfo.GetShape(); + switch (dataLayout) + { + case DataLayout::NHWC: + // The data layout is NHWC, reshape from [ H, W, I, M ] to [ 1, H, W, I * M ] + weightInfo.SetShape({ 1, + weightShape[0], + weightShape[1], + weightShape[2] * weightShape[3] }); + break; + case DataLayout::NCHW: + default: + // The data layout is NCHW, reshape from [ M, I, H, W ] to [ 1, I * M, H, W, ] + weightInfo.SetShape({ 1, + weightShape[0] * weightShape[1], + weightShape[2], + weightShape[3] }); + break; + } +} + +TensorInfo ConvertWeightTensorInfoFromArmnnToAcl(const TensorInfo& weightInfo, DataLayout dataLayout) +{ + // Convert the weight format from ArmNN's [ M, I, H, W ] (does NOT depend on the data layout) to either + // [ 1, H, W, I * M ] (if NHWC) or [ 1, I * M, H, W ] (if NCHW), as required by the compute library + + // 1. 
Permute the weights if necessary + // If the data layout is NCHW no permutation is necessary, as a reshape to [ 1, I * M, H, W ] can be better done + // starting from the current shape of [ M, I, H, W ] + TensorInfo weightPermutedInfo(weightInfo); + if (dataLayout == DataLayout::NHWC) + { + // The data layout is NHWC, then permute the weights from [ M, I, H, W ] to [ H, W, I, M ] + PermutationVector permutationVector{ 3, 2, 0, 1 }; + weightPermutedInfo = armnnUtils::Permuted(weightInfo, permutationVector); + } + + // 2. Reshape the weights + ReshapeWeightsForAcl(weightPermutedInfo, dataLayout); + + // 3. Return the permuted weight info + return weightPermutedInfo; +} + +armnn::ConstTensor ConvertWeightTensorFromArmnnToAcl(const ConstCpuTensorHandle* weightTensor, + DataLayout dataLayout, + void* permuteBuffer) +{ + BOOST_ASSERT_MSG(weightTensor, "Invalid input tensor"); + BOOST_ASSERT_MSG(permuteBuffer, "Invalid permute buffer"); + + // Convert the weight format from ArmNN's [ M, I, H, W ] (does NOT depend on the data layout) to either + // [ 1, H, W, I * M ] (if NHWC) or [ 1, I * M, H, W ] (if NCHW), as required by the compute library + + // 1. Permute the weights if necessary + // If the data layout is NCHW no permutation is necessary, as a reshape to [ 1, I * M, H, W ] can be better done + // starting from the current shape of [ M, I, H, W ] + // If no permutation is necessary, leave the permutation vector empty + PermutationVector permutationVector{}; + if (dataLayout == DataLayout::NHWC) + { + // The data layout is NHWC, then permute the weights from [ M, I, H, W ] to [ H, W, I, M ] + permutationVector = { 3, 2, 0, 1 }; + } + ConstTensor weightPermuted = PermuteTensor(weightTensor, permutationVector, permuteBuffer); + + // 2. Reshape the weights + ReshapeWeightsForAcl(weightPermuted.GetInfo(), dataLayout); + + // 3. 
Return the permuted tensor; it points into permuteBuffer, which the caller must keep alive + return weightPermuted; +} + +} // namespace armnn diff --git a/src/backends/backendsCommon/WorkloadUtils.hpp b/src/backends/backendsCommon/WorkloadUtils.hpp index 2b07b2b..a1a8d2a 100644 --- a/src/backends/backendsCommon/WorkloadUtils.hpp +++ b/src/backends/backendsCommon/WorkloadUtils.hpp @@ -6,35 +6,42 @@ #pragma once #include "ITensorHandle.hpp" +#include "CpuTensorHandle.hpp" #include +#include +#include +#include + #include namespace armnn { namespace { + template void AssignValues(unsigned int num, unsigned int& idx, const ArrayType& array, Arg& arg) { - if (idx >= num) - { - return; - } + if (idx >= num) + { + return; + } - arg = array[(num - 1) - idx]; - idx++; -}; + arg = array[(num - 1) - idx]; + idx++; +} template void AssignValues(unsigned int num, unsigned int idx, const ArrayType& array, T& assignee, Args& ... args) { - AssignValues(num, idx, array, assignee); + AssignValues(num, idx, array, assignee); - AssignValues(num, idx, array, args...); + AssignValues(num, idx, array, args...); } -} // namespace + +} // anonymous namespace template void CopyTensorContentsGeneric(const ITensorHandle* srcTensor, ITensorHandle* dstTensor, CopyFunc copy) @@ -142,4 +149,16 @@ void GatherTensorHandlePairs(const DescriptorType& descriptor, } } -} //namespace armnn \ No newline at end of file +armnn::ConstTensor PermuteTensor(const ConstCpuTensorHandle* tensor, + const PermutationVector& permutationVector, + void* permuteBuffer); + +void ReshapeWeightsForAcl(TensorInfo& weightInfo, DataLayout dataLayout); + +TensorInfo ConvertWeightTensorInfoFromArmnnToAcl(const TensorInfo& weightInfo, DataLayout dataLayout); + +armnn::ConstTensor ConvertWeightTensorFromArmnnToAcl(const ConstCpuTensorHandle* weightTensor, + DataLayout dataLayout, + void* permuteBuffer); + +} //namespace armnn diff --git a/src/backends/backendsCommon/common.mk b/src/backends/backendsCommon/common.mk index
a66b5c4..4e79bfc 100644 --- a/src/backends/backendsCommon/common.mk +++ b/src/backends/backendsCommon/common.mk @@ -14,7 +14,8 @@ COMMON_SOURCES := \ MemCopyWorkload.cpp \ OutputHandler.cpp \ WorkloadData.cpp \ - WorkloadFactory.cpp + WorkloadFactory.cpp \ + WorkloadUtils.cpp # COMMON_TEST_SOURCES contains the list of files to be included # in the Android unit test build (armnn-tests) and it is picked diff --git a/src/backends/backendsCommon/test/Conv2dTestImpl.hpp b/src/backends/backendsCommon/test/Conv2dTestImpl.hpp index 37fa0f6..2ff66b0 100755 --- a/src/backends/backendsCommon/test/Conv2dTestImpl.hpp +++ b/src/backends/backendsCommon/test/Conv2dTestImpl.hpp @@ -327,7 +327,7 @@ LayerTestResult DepthwiseConvolution2dAsymmetricTestImpl( armnn::IWorkloadFactory& workloadFactory, const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager, const boost::multi_array& input, - const boost::multi_array& originalKernel, + const boost::multi_array& kernel, const boost::multi_array& bias, const boost::multi_array& outputExpected, float qScale, @@ -344,10 +344,10 @@ LayerTestResult DepthwiseConvolution2dAsymmetricTestImpl( unsigned int inputChannels = boost::numeric_cast(input.shape()[1]); unsigned int inputHeight = boost::numeric_cast(input.shape()[2]); unsigned int inputWidth = boost::numeric_cast(input.shape()[3]); - unsigned int kernelChanMul = boost::numeric_cast(originalKernel.shape()[0]); - unsigned int kernelChannels = boost::numeric_cast(originalKernel.shape()[1]); - unsigned int kernelHeight = boost::numeric_cast(originalKernel.shape()[2]); - unsigned int kernelWidth = boost::numeric_cast(originalKernel.shape()[3]); + unsigned int kernelChanMul = boost::numeric_cast(kernel.shape()[0]); + unsigned int kernelChannels = boost::numeric_cast(kernel.shape()[1]); + unsigned int kernelHeight = boost::numeric_cast(kernel.shape()[2]); + unsigned int kernelWidth = boost::numeric_cast(kernel.shape()[3]); unsigned int outputNum = 
boost::numeric_cast(outputExpected.shape()[0]); unsigned int outputChannels = boost::numeric_cast(outputExpected.shape()[1]); unsigned int outputHeight = boost::numeric_cast(outputExpected.shape()[2]); @@ -362,8 +362,7 @@ LayerTestResult DepthwiseConvolution2dAsymmetricTestImpl( armnnUtils::GetTensorInfo(inputNum, inputChannels, inputHeight, inputWidth, layout); armnn::TensorInfo outputTensorInfo = armnnUtils::GetTensorInfo(outputNum, outputChannels, outputHeight, outputWidth, layout); - armnn::TensorInfo kernelDesc = - armnnUtils::GetTensorInfo(kernelChanMul, kernelChannels, kernelHeight, kernelWidth, layout); + armnn::TensorInfo kernelDesc({kernelChanMul, kernelChannels, kernelHeight, kernelWidth}, armnn::GetDataType()); armnn::TensorInfo biasDesc({static_cast(bias.size())}, armnn::GetDataType()); // Set quantization parameters if the requested type is a quantized type. @@ -423,13 +422,6 @@ LayerTestResult DepthwiseConvolution2dAsymmetricTestImpl( armnn::ScopedCpuTensorHandle weightsTensor(kernelDesc); - // Permute the kernel if necessary - boost::multi_array kernel = boost::multi_array(originalKernel); - if (layout == armnn::DataLayout::NHWC) - { - armnnUtils::Permute(kernelDesc.GetShape(), NCHWToNHWC, originalKernel.data(), kernel.data()); - } - AllocateAndCopyDataToITensorHandle(&weightsTensor, &kernel[0][0][0][0]); armnn::ScopedCpuTensorHandle biasTensor(biasDesc); @@ -484,6 +476,7 @@ LayerTestResult DepthwiseConvolution2dDepthMul1TestImpl( unsigned int kernelHeight = 3; unsigned int kernelWidth = 3; unsigned int kernelChannels = inputChannels; + unsigned int kernelDepthMultiplier = 1; unsigned int outputHeight = 1; unsigned int outputWidth = 1; @@ -494,7 +487,8 @@ LayerTestResult DepthwiseConvolution2dDepthMul1TestImpl( armnnUtils::GetTensorInfo(inputNum, inputChannels, inputHeight, inputWidth, layout); armnn::TensorInfo outputTensorInfo = armnnUtils::GetTensorInfo(outputNum, outputChannels, outputHeight, outputWidth, layout); - armnn::TensorInfo kernelDesc 
= armnnUtils::GetTensorInfo(1, outputChannels, kernelHeight, kernelWidth, layout); + armnn::TensorInfo kernelDesc({kernelDepthMultiplier, kernelChannels, kernelHeight, kernelWidth}, + armnn::GetDataType()); armnn::TensorInfo biasDesc({ outputChannels }, armnn::GetDataType()); // Set quantization parameters if the requested type is a quantized type. @@ -543,12 +537,6 @@ LayerTestResult DepthwiseConvolution2dDepthMul1TestImpl( 0.f, 0.f, 0.f, -1.f, 0.f, -1.f, })); - if (layout == armnn::DataLayout::NHWC) - { - std::vector tmp(kernelData.size()); - armnnUtils::Permute(kernelDesc.GetShape(), NCHWToNHWC, kernelData.data(), tmp.data()); - kernelData = tmp; - } auto kernel = MakeTensor(kernelDesc, kernelData); // Manually calculated. @@ -642,8 +630,8 @@ LayerTestResult DepthwiseConvolution2dTestImpl( inputBatchSize, inputChannels, inputHeight, inputWidth, layout); armnn::TensorInfo outputTensorInfo = armnnUtils::GetTensorInfo( outputBatchSize, outputChannels, outputHeight, outputWidth, layout); - armnn::TensorInfo kernelDesc = armnnUtils::GetTensorInfo( - depthMultiplier, inputChannels, kernelHeight, kernelWidth, layout); + armnn::TensorInfo kernelDesc({depthMultiplier, inputChannels, kernelHeight, kernelWidth}, + armnn::GetDataType()); armnn::TensorInfo biasDesc({outputChannels}, armnn::GetDataType()); // Set quantization parameters if the requested type is a quantized type. 
@@ -692,7 +680,7 @@ LayerTestResult DepthwiseConvolution2dTestImpl( {0, 2, 1, -1})); auto bias = MakeTensor(biasDesc, biasV); - std::vector originalKernelData = std::vector( + std::vector kernelData = std::vector( QuantizedVector(kernelDesc.GetQuantizationScale(), kernelDesc.GetQuantizationOffset(), { 1, 1, 1, 1, -1, 1, @@ -717,12 +705,8 @@ LayerTestResult DepthwiseConvolution2dTestImpl( 0, 1, 0, 0, 0, 0, 0, 0, 0 + })); - std::vector kernelData = originalKernelData; - if (layout == armnn::DataLayout::NHWC) - { - armnnUtils::Permute(kernelDesc.GetShape(), NCHWToNHWC, originalKernelData.data(), kernelData.data()); - } auto kernel = MakeTensor(kernelDesc, kernelData); // Manually calculated. @@ -840,9 +824,9 @@ LayerTestResult DepthwiseConvolution2dNhwcTestImpl( unsigned int inputWidth = boost::numeric_cast(input.shape()[2]); unsigned int kernelChanMul = boost::numeric_cast(kernel.shape()[0]); - unsigned int kernelChannels = boost::numeric_cast(kernel.shape()[3]); - unsigned int kernelHeight = boost::numeric_cast(kernel.shape()[1]); - unsigned int kernelWidth = boost::numeric_cast(kernel.shape()[2]); + unsigned int kernelChannels = boost::numeric_cast(kernel.shape()[1]); + unsigned int kernelHeight = boost::numeric_cast(kernel.shape()[2]); + unsigned int kernelWidth = boost::numeric_cast(kernel.shape()[3]); unsigned int outputNum = boost::numeric_cast(outputExpected.shape()[0]); unsigned int outputChannels = boost::numeric_cast(outputExpected.shape()[3]); @@ -853,7 +837,7 @@ LayerTestResult DepthwiseConvolution2dNhwcTestImpl( armnn::TensorInfo inputTensorInfo({inputNum, inputHeight, inputWidth, inputChannels}, armnn::GetDataType()); armnn::TensorInfo outputTensorInfo({outputNum, outputHeight, outputWidth, outputChannels}, armnn::GetDataType()); - armnn::TensorInfo kernelDesc({kernelChanMul, kernelHeight, kernelWidth, kernelChannels}, armnn::GetDataType()); + armnn::TensorInfo kernelDesc({kernelChanMul, kernelChannels, kernelHeight, kernelWidth}, armnn::GetDataType()); 
armnn::TensorInfo biasDesc({static_cast(bias.size())}, armnn::GetDataType()); // Set quantization parameters if the requested type is a quantized type. @@ -1068,10 +1052,10 @@ LayerTestResult CompareConvolution2dTestImpl( armnn::TensorInfo kernelDesc; armnn::TensorInfo biasDesc; - unsigned int inputShape[] = {inputNum, inputChannels, inputHeight, inputWidth}; - unsigned int outputShape[] = {outputNum, outputChannels, outputHeight, outputWidth}; - unsigned int kernelShape[] = {outputChannels, inputChannels, kernelHeight, kernelWidth}; - unsigned int biasShape[] = {outputChannels}; + unsigned int inputShape[] = {inputNum, inputChannels, inputHeight, inputWidth}; + unsigned int outputShape[] = {outputNum, outputChannels, outputHeight, outputWidth}; + unsigned int kernelShape[] = {outputChannels, inputChannels, kernelHeight, kernelWidth}; + unsigned int biasShape[] = {outputChannels}; inputTensorInfo = armnn::TensorInfo(4, inputShape, armnn::GetDataType()); outputTensorInfo = armnn::TensorInfo(4, outputShape, armnn::GetDataType()); @@ -1171,19 +1155,17 @@ LayerTestResult CompareDepthwiseConvolution2dTestImpl( std::vector inputShape; std::vector outputShape; - std::vector kernelShape; - std::vector biasShape= { outputChannels }; + std::vector kernelShape{ channelMultiplier, inputChannels, kernelHeight, kernelWidth }; + std::vector biasShape{ outputChannels }; switch (layout.GetDataLayout()) { case armnn::DataLayout::NCHW: inputShape = { inputNum, inputChannels, inputHeight, inputWidth }; outputShape = { outputNum, outputChannels, outputHeight, outputWidth }; - kernelShape = { channelMultiplier, inputChannels, kernelHeight, kernelWidth }; break; case armnn::DataLayout ::NHWC: inputShape = { inputNum, inputHeight, inputWidth, inputChannels }; outputShape = { outputNum, outputHeight, outputWidth, outputChannels }; - kernelShape = { channelMultiplier, kernelHeight, kernelWidth, inputChannels }; break; default: throw armnn::InvalidArgumentException("unknown data layout [" 
diff --git a/src/backends/backendsCommon/test/LayerTests.cpp b/src/backends/backendsCommon/test/LayerTests.cpp index ddf0d0b..819b9d6 100755 --- a/src/backends/backendsCommon/test/LayerTests.cpp +++ b/src/backends/backendsCommon/test/LayerTests.cpp @@ -661,28 +661,18 @@ LayerTestResult DepthwiseConvolution2dNhwcTestCommon( 24, 49 }))); - armnn::TensorInfo kernelTensorInfo({ 1, 4, 4, 2}, armnn::GetDataType()); + armnn::TensorInfo kernelTensorInfo({ 1, 2, 4, 4 }, armnn::GetDataType()); auto kernel = MakeTensor(kernelTensorInfo, std::vector( QuantizedVector(kernelTensorInfo.GetQuantizationScale(), kernelTensorInfo.GetQuantizationOffset(), { - 32, 16, - 31, 15, - 30, 14, - 29, 13, - - 28, 12, - 27, 11, - 26, 10, - 25, 9, - - 24, 8, - 23, 7, - 22, 6, - 21, 5, - - 20, 4, - 19, 3, - 18, 2, - 17, 1 + 32, 31, 30, 29, + 28, 27, 26, 25, + 24, 23, 22, 21, + 20, 19, 18, 17, + + 16, 15, 14, 13, + 12, 11, 10, 9, + 8, 7, 6, 5, + 4, 3, 2, 1 }))); armnn::TensorInfo outputTensorInfo({ 1, 5, 5, 2}, armnn::GetDataType()); diff --git a/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp b/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp index 9cadbf0..1745b82 100644 --- a/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp +++ b/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include @@ -21,14 +22,23 @@ namespace armnn using namespace armcomputetensorutils; arm_compute::Status ClDepthwiseConvolutionWorkloadValidate(const TensorInfo& input, - const TensorInfo& output, - const DepthwiseConvolution2dDescriptor& descriptor, - const TensorInfo& weights, - const Optional& biases) + const TensorInfo& output, + const DepthwiseConvolution2dDescriptor& descriptor, + const TensorInfo& weights, + const Optional& biases) { - const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input, descriptor.m_DataLayout); + const arm_compute::TensorInfo aclInputInfo = 
BuildArmComputeTensorInfo(input, descriptor.m_DataLayout); const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output, descriptor.m_DataLayout); - const arm_compute::TensorInfo aclWeightsInfo = BuildArmComputeTensorInfo(weights, descriptor.m_DataLayout); + + // ArmNN's weight format is [ M, I, H, W ] + const unsigned int aclDepthMultiplier = weights.GetShape()[0]; + + // Convert the weight format from ArmNN's [ M, I, H, W ] (does NOT depend on the data layout) to either + // [ 1, H, W, I * M ] (if NHWC) or [ 1, I * M, H, W ] (if NCHW), as required by the compute library + TensorInfo weightsPermuted = ConvertWeightTensorInfoFromArmnnToAcl(weights, descriptor.m_DataLayout); + + // Convert the weights into the compute library format + const arm_compute::TensorInfo aclWeightsInfo = BuildArmComputeTensorInfo(weightsPermuted, descriptor.m_DataLayout); arm_compute::TensorInfo aclBiasesInfo; arm_compute::TensorInfo *optionalAclBiasesInfo = nullptr; @@ -42,7 +52,6 @@ arm_compute::Status ClDepthwiseConvolutionWorkloadValidate(const TensorInfo& inp } const arm_compute::PadStrideInfo aclPadStrideInfo = BuildArmComputePadStrideInfo(descriptor); - const unsigned int aclDepthMultiplier = weights.GetShape()[0]; return arm_compute::CLDepthwiseConvolutionLayer::validate(&aclInputInfo, &aclWeightsInfo, @@ -57,10 +66,18 @@ ClDepthwiseConvolutionWorkload::ClDepthwiseConvolutionWorkload( const WorkloadInfo& info) : BaseWorkload(descriptor, info) { - auto& weightInfo = m_Data.m_Weight->GetTensorInfo(); + // Allocate a buffer for the swizzling of the weight tensor + std::unique_ptr permuteBuffer(new unsigned char[m_Data.m_Weight->GetTensorInfo().GetNumBytes()]); + + // Convert the weight format from ArmNN's [ M, I, H, W ] (does NOT depend on the data layout) to either + // [ 1, H, W, I * M ] (if NHWC) or [ 1, I * M, H, W ] (if NCHW), as required by the compute library + ConstTensor weightPermuted = ConvertWeightTensorFromArmnnToAcl(m_Data.m_Weight, + 
m_Data.m_Parameters.m_DataLayout, + permuteBuffer.get()); + // Convert the weights into the compute library format m_KernelTensor = std::make_unique(); - BuildArmComputeTensor(*m_KernelTensor, weightInfo, m_Data.m_Parameters.m_DataLayout); + BuildArmComputeTensor(*m_KernelTensor, weightPermuted.GetInfo(), m_Data.m_Parameters.m_DataLayout); if (m_Data.m_Parameters.m_BiasEnabled) { @@ -86,13 +103,14 @@ ClDepthwiseConvolutionWorkload::ClDepthwiseConvolutionWorkload( input.info()->set_data_layout(aclDataLayout); output.info()->set_data_layout(aclDataLayout); - const unsigned int depthMultiplier = weightInfo.GetShape()[0]; + // ArmNN's weight format is [ M, I, H, W ] + auto& weightInfo = m_Data.m_Weight->GetTensorInfo(); - const unsigned int widthIndex = (m_Data.m_Parameters.m_DataLayout == DataLayout::NCHW) ? 3 : 2; - const unsigned int heightIndex = (m_Data.m_Parameters.m_DataLayout == DataLayout::NCHW) ? 2 : 1; + // Get the depth multiplier + const unsigned int depthMultiplier = weightInfo.GetShape()[0]; - //Check for optimisation opportunities. - bool use3x3Optimisation = (weightInfo.GetShape()[widthIndex] == 3) && (weightInfo.GetShape()[heightIndex] == 3); + // Check for optimisation opportunities. 
+ bool use3x3Optimisation = (weightInfo.GetShape()[2] == 3) && (weightInfo.GetShape()[3] == 3); if (use3x3Optimisation) { m_DepthwiseConvolutionLayer = std::make_unique(); @@ -118,7 +136,8 @@ ClDepthwiseConvolutionWorkload::ClDepthwiseConvolutionWorkload( BOOST_ASSERT(m_DepthwiseConvolutionLayer); - InitializeArmComputeClTensorData(*m_KernelTensor, m_Data.m_Weight); + ScopedCpuTensorHandle weightsPermutedHandle(weightPermuted); + InitializeArmComputeClTensorData(*m_KernelTensor, &weightsPermutedHandle); if (m_BiasTensor) { diff --git a/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp b/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp index 6cad12c..be26359 100644 --- a/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp +++ b/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp @@ -8,10 +8,7 @@ #include #include #include - -#include - -using namespace armnnUtils; +#include namespace armnn { @@ -19,17 +16,23 @@ namespace armnn using namespace armcomputetensorutils; arm_compute::Status NeonDepthwiseConvolutionWorkloadValidate(const TensorInfo& input, - const TensorInfo& output, - const DepthwiseConvolution2dDescriptor& descriptor, - const TensorInfo& weights, - const Optional& biases) + const TensorInfo& output, + const DepthwiseConvolution2dDescriptor& descriptor, + const TensorInfo& weights, + const Optional& biases) { - const arm_compute::TensorInfo aclInputInfo = - BuildArmComputeTensorInfo(input, descriptor.m_DataLayout); - const arm_compute::TensorInfo aclOutputInfo = - BuildArmComputeTensorInfo(output, descriptor.m_DataLayout); - const arm_compute::TensorInfo aclWeightsInfo = - BuildArmComputeTensorInfo(weights, descriptor.m_DataLayout); + const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input, descriptor.m_DataLayout); + const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output, descriptor.m_DataLayout); + + // ArmNN's weight format is [ M, I, H, W ] + const 
unsigned int aclDepthMultiplier = weights.GetShape()[0]; + + // Convert the weight format from ArmNN's [ M, I, H, W ] (does NOT depend on the data layout) to either + // [ 1, H, W, I * M ] (if NHWC) or [ 1, I * M, H, W ] (if NCHW), as required by the compute library + TensorInfo weightsPermuted = ConvertWeightTensorInfoFromArmnnToAcl(weights, descriptor.m_DataLayout); + + // Convert the weights into the compute library format + const arm_compute::TensorInfo aclWeightsInfo = BuildArmComputeTensorInfo(weightsPermuted, descriptor.m_DataLayout); arm_compute::TensorInfo aclBiasesInfo; arm_compute::TensorInfo *optionalAclBiasesInfo = nullptr; @@ -42,9 +45,7 @@ arm_compute::Status NeonDepthwiseConvolutionWorkloadValidate(const TensorInfo& i optionalAclBiasesInfo = &aclBiasesInfo; } - const arm_compute::PadStrideInfo aclPadStrideInfo = - BuildArmComputePadStrideInfo(descriptor); - const unsigned int aclDepthMultiplier = weights.GetShape()[0]; + const arm_compute::PadStrideInfo aclPadStrideInfo = BuildArmComputePadStrideInfo(descriptor); return arm_compute::NEDepthwiseConvolutionLayer::validate(&aclInputInfo, &aclWeightsInfo, @@ -59,14 +60,21 @@ NeonDepthwiseConvolutionWorkload::NeonDepthwiseConvolutionWorkload( const WorkloadInfo& info) : BaseWorkload(descriptor, info) { - const TensorInfo& weightInfo = m_Data.m_Weight->GetTensorInfo(); + // ArmNN's weight format is [ M, I, H, W ] + auto& weightInfo = m_Data.m_Weight->GetTensorInfo(); - m_KernelTensor = std::make_unique(); - BuildArmComputeTensor(*m_KernelTensor, weightInfo, m_Data.m_Parameters.m_DataLayout); + // Allocate a buffer for the swizzling of the weight tensor + std::unique_ptr permuteBuffer(new unsigned char[m_Data.m_Weight->GetTensorInfo().GetNumBytes()]); - INeonTensorHandle* inputTensorHandle = static_cast(m_Data.m_Inputs[0]); - INeonTensorHandle* outputTensorHandle = static_cast(m_Data.m_Outputs[0]); - DataLayoutIndexed dataLayoutIndex(m_Data.m_Parameters.m_DataLayout); + // Convert the weight format from 
ArmNN's [ M, I, H, W ] (does NOT depend on the data layout) to either + // [ 1, H, W, I * M ] (if NHWC) or [ 1, I * M, H, W ] (if NCHW), as required by the compute library + ConstTensor weightPermuted = ConvertWeightTensorFromArmnnToAcl(m_Data.m_Weight, + m_Data.m_Parameters.m_DataLayout, + permuteBuffer.get()); + + // Convert the weights into the compute library format + m_KernelTensor = std::make_unique(); + BuildArmComputeTensor(*m_KernelTensor, weightPermuted.GetInfo(), m_Data.m_Parameters.m_DataLayout); if (m_Data.m_Parameters.m_BiasEnabled) { @@ -84,6 +92,9 @@ NeonDepthwiseConvolutionWorkload::NeonDepthwiseConvolutionWorkload( m_Data.ValidateInputsOutputs("NeonDepthwiseConvolutionWorkload", 1, 1); + INeonTensorHandle* inputTensorHandle = static_cast(m_Data.m_Inputs[0]); + INeonTensorHandle* outputTensorHandle = static_cast(m_Data.m_Outputs[0]); + arm_compute::ITensor& input = inputTensorHandle->GetTensor(); arm_compute::ITensor& output = outputTensorHandle->GetTensor(); @@ -91,9 +102,11 @@ NeonDepthwiseConvolutionWorkload::NeonDepthwiseConvolutionWorkload( input.info()->set_data_layout(aclDataLayout); output.info()->set_data_layout(aclDataLayout); - bool use3x3Optimisation = weightInfo.GetShape()[dataLayoutIndex.GetWidthIndex()] == 3 && - weightInfo.GetShape()[dataLayoutIndex.GetHeightIndex()] == 3; + // Get the depth multiplier + const unsigned int depthMultiplier = weightInfo.GetShape()[0]; + // Check for optimisation opportunities. 
+ bool use3x3Optimisation = (weightInfo.GetShape()[2] == 3) && (weightInfo.GetShape()[3] == 3); if (use3x3Optimisation) { m_pDepthwiseConvolutionLayer = std::make_unique(); @@ -102,7 +115,8 @@ NeonDepthwiseConvolutionWorkload::NeonDepthwiseConvolutionWorkload( m_KernelTensor.get(), m_BiasTensor.get(), &output, - padStrideInfo); + padStrideInfo, + depthMultiplier); } else { @@ -112,12 +126,14 @@ NeonDepthwiseConvolutionWorkload::NeonDepthwiseConvolutionWorkload( m_KernelTensor.get(), m_BiasTensor.get(), &output, - padStrideInfo); + padStrideInfo, + depthMultiplier); } BOOST_ASSERT(m_pDepthwiseConvolutionLayer); - InitializeArmComputeTensorData(*m_KernelTensor, m_Data.m_Weight); + ScopedCpuTensorHandle weightsPermutedHandle(weightPermuted); + InitializeArmComputeTensorData(*m_KernelTensor, &weightsPermutedHandle); if (m_Data.m_Parameters.m_BiasEnabled) { diff --git a/src/backends/reference/workloads/ConvImpl.hpp b/src/backends/reference/workloads/ConvImpl.hpp index 704bc36..5c07f57 100644 --- a/src/backends/reference/workloads/ConvImpl.hpp +++ b/src/backends/reference/workloads/ConvImpl.hpp @@ -57,7 +57,6 @@ static void ConvImpl(ConvData data, float filterScale, int32_t filterOffset, const BiasType* biasData, - InputType* outputData, float outputScale, int32_t outputOffset, const TensorInfo& filterInfo, @@ -68,10 +67,10 @@ static void ConvImpl(ConvData data, throw InvalidArgumentException("Bias is enabled but the bias data is invalid"); } - const TensorInfo& inputInfo0 = GetTensorInfo(data.m_Inputs[0]); - const TensorInfo& outputInfo0 = GetTensorInfo(data.m_Outputs[0]); + const TensorInfo& inputInfo = GetTensorInfo(data.m_Inputs[0]); + const TensorInfo& outputInfo = GetTensorInfo(data.m_Outputs[0]); - TensorBufferArrayView output(outputInfo0.GetShape(), + TensorBufferArrayView output(outputInfo.GetShape(), GetOutputTensorData(0, data), data.m_Parameters.m_DataLayout); @@ -81,18 +80,18 @@ static void ConvImpl(ConvData data, const unsigned int heightIndex = 
dataLayoutIndexed.GetHeightIndex(); const unsigned int widthIndex = dataLayoutIndexed.GetWidthIndex(); - unsigned int depthMult = depthwise ? filterInfo.GetShape()[0] : 1; - unsigned int channelsInput = filterInfo.GetShape()[channelsIndex]; - unsigned int channelsOutput = depthwise ? channelsInput * depthMult : filterInfo.GetShape()[0]; + unsigned int depthMultiplier = depthwise ? filterInfo.GetShape()[0] : 1; + unsigned int inputChannels = depthwise ? filterInfo.GetShape()[1] : filterInfo.GetShape()[channelsIndex]; + unsigned int outputChannels = depthwise ? inputChannels * depthMultiplier : filterInfo.GetShape()[0]; - unsigned int batchSize = outputInfo0.GetShape()[0]; - unsigned int heightOutput = outputInfo0.GetShape()[heightIndex]; - unsigned int widthOutput = outputInfo0.GetShape()[widthIndex]; - unsigned int heightInput = inputInfo0.GetShape()[heightIndex]; - unsigned int widthInput = inputInfo0.GetShape()[widthIndex]; + unsigned int batchSize = outputInfo.GetShape()[0]; + unsigned int outputHeight = outputInfo.GetShape()[heightIndex]; + unsigned int outputWidth = outputInfo.GetShape()[widthIndex]; + unsigned int inputHeight = inputInfo.GetShape()[heightIndex]; + unsigned int inputWidth = inputInfo.GetShape()[widthIndex]; - unsigned int heightFilter = filterInfo.GetShape()[heightIndex]; - unsigned int widthFilter = filterInfo.GetShape()[widthIndex]; + unsigned int filterHeight = depthwise ? filterInfo.GetShape()[2] : filterInfo.GetShape()[heightIndex]; + unsigned int filterWidth = depthwise ? filterInfo.GetShape()[3] : filterInfo.GetShape()[widthIndex]; unsigned int paddingTop = data.m_Parameters.m_PadTop; unsigned int paddingLeft = data.m_Parameters.m_PadLeft; @@ -102,68 +101,56 @@ static void ConvImpl(ConvData data, // The world's least efficient convolution. 
for (unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++) { - for (unsigned int cOutput = 0; cOutput < channelsOutput; cOutput++) + for (unsigned int cOutput = 0; cOutput < outputChannels; cOutput++) { - for (unsigned int yOutput = 0; yOutput < heightOutput; yOutput++) + for (unsigned int yOutput = 0; yOutput < outputHeight; yOutput++) { - for (unsigned int xOutput = 0; xOutput < widthOutput; xOutput++) + for (unsigned int xOutput = 0; xOutput < outputWidth; xOutput++) { // This loop goes over each output element. AccumulatorType sum = AccumulatorType(); // For depthwise, each output channel corresponds to exactly one input channel. // For normal, must loop over each input channel. - for (unsigned int cInput = 0; cInput < (depthwise ? 1 : channelsInput); cInput++) + for (unsigned int cInput = 0; cInput < (depthwise ? 1 : inputChannels); cInput++) { unsigned int depthwiseMultiplierIdx = 0; if (depthwise) { - cInput = cOutput / depthMult; - depthwiseMultiplierIdx = cOutput % depthMult; + cInput = cOutput / depthMultiplier; + depthwiseMultiplierIdx = cOutput % depthMultiplier; } - for (unsigned int yFilter = 0; yFilter < heightFilter; yFilter++) + for (unsigned int yFilter = 0; yFilter < filterHeight; yFilter++) { - for (unsigned int xFilter = 0; xFilter < widthFilter; xFilter++) + for (unsigned int xFilter = 0; xFilter < filterWidth; xFilter++) { // This loop goes over each input element for each output element. - unsigned int filterIndex; + unsigned int filterIndex = 0; // Since dimensionality of kernel depends on depthwiseness, so does index. 
if (depthwise) { - if (data.m_Parameters.m_DataLayout == DataLayout::NHWC) - { - filterIndex = depthwiseMultiplierIdx * heightFilter * widthFilter - * channelsInput + - yFilter * widthFilter * channelsInput + - xFilter * channelsInput + - cInput; - } - else - { - filterIndex = depthwiseMultiplierIdx * widthFilter * heightFilter - * channelsInput + - cInput * widthFilter * heightFilter + - yFilter * widthFilter + - xFilter; - } + filterIndex = depthwiseMultiplierIdx * filterWidth * filterHeight * inputChannels + + cInput * filterWidth * filterHeight + + yFilter * filterWidth + + xFilter; } else { if (data.m_Parameters.m_DataLayout == DataLayout::NHWC) { - filterIndex = cOutput * heightFilter * widthFilter * channelsInput + - yFilter * widthFilter * channelsInput + - xFilter * channelsInput + + filterIndex = cOutput * filterHeight * filterWidth * inputChannels + + yFilter * filterWidth * inputChannels + + xFilter * inputChannels + cInput; } else { - filterIndex = cOutput * widthFilter * heightFilter * channelsInput + - cInput * widthFilter * heightFilter + - yFilter * widthFilter + + filterIndex = cOutput * filterWidth * filterHeight * inputChannels + + cInput * filterWidth * filterHeight + + yFilter * filterWidth + xFilter; } } @@ -177,8 +164,8 @@ static void ConvImpl(ConvData data, AccumulatorType inputValue; // Check if we're in the padding. 
- if (yInput < paddingTop || yInput >= heightInput + paddingTop || - xInput < paddingLeft || xInput >= widthInput + paddingLeft ) + if (yInput < paddingTop || yInput >= inputHeight + paddingTop || + xInput < paddingLeft || xInput >= inputWidth + paddingLeft ) { inputValue = AccumulatorType(); } @@ -188,17 +175,17 @@ static void ConvImpl(ConvData data, if (data.m_Parameters.m_DataLayout == DataLayout::NHWC) { - inputIndex = batchIdx * heightInput * widthInput * channelsInput + - (yInput - paddingTop) * widthInput * channelsInput + - (xInput - paddingLeft) * channelsInput + + inputIndex = batchIdx * inputHeight * inputWidth * inputChannels + + (yInput - paddingTop) * inputWidth * inputChannels + + (xInput - paddingLeft) * inputChannels + cInput; } else { - inputIndex = batchIdx * widthInput * heightInput * channelsInput + - widthInput * heightInput * cInput + - widthInput * (yInput - paddingTop) + + inputIndex = batchIdx * inputWidth * inputHeight * inputChannels + + inputWidth * inputHeight * cInput + + inputWidth * (yInput - paddingTop) + xInput - paddingLeft; } diff --git a/src/backends/reference/workloads/RefConvolution2dFloat32Workload.cpp b/src/backends/reference/workloads/RefConvolution2dFloat32Workload.cpp index 2090564..7b298df 100644 --- a/src/backends/reference/workloads/RefConvolution2dFloat32Workload.cpp +++ b/src/backends/reference/workloads/RefConvolution2dFloat32Workload.cpp @@ -23,15 +23,13 @@ void RefConvolution2dFloat32Workload::Execute() const { ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefConvolution2dFloat32Workload_Execute"); - float* outputData = GetOutputTensorDataFloat(0, m_Data); const float* inputData = GetInputTensorDataFloat(0, m_Data); - const float* weightData = m_Weight->template GetConstTensor(); - const float* biasData = m_Data.m_Parameters.m_BiasEnabled ? - m_Bias->template GetConstTensor() : nullptr; + const float* filterData = m_Weight->template GetConstTensor(); + const float* biasData = m_Data.m_Parameters.m_BiasEnabled ? 
m_Bias->template GetConstTensor() : nullptr; const TensorInfo& filterInfo = m_Weight->GetTensorInfo(); ConvImpl( - m_Data, inputData, 0.0f, 0, weightData, 0.0f, 0, biasData, outputData, 0.0f, 0, filterInfo); + m_Data, inputData, 0.0f, 0, filterData, 0.0f, 0, biasData, 0.0f, 0, filterInfo); } } //namespace armnn diff --git a/src/backends/reference/workloads/RefConvolution2dUint8Workload.cpp b/src/backends/reference/workloads/RefConvolution2dUint8Workload.cpp index 881e9bf..af2c7ad 100644 --- a/src/backends/reference/workloads/RefConvolution2dUint8Workload.cpp +++ b/src/backends/reference/workloads/RefConvolution2dUint8Workload.cpp @@ -27,10 +27,7 @@ void RefConvolution2dUint8Workload::Execute() const const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]); const uint8_t* weightsData = m_Weight->template GetConstTensor(); const TensorInfo& weightsInfo = GetTensorInfo(m_Weight.get()); - const int32_t* biasData = m_Data.m_Parameters.m_BiasEnabled ? - m_Bias->template GetConstTensor() : - nullptr; - uint8_t* outputData = GetOutputTensorDataU8(0, m_Data); + const int32_t* biasData = m_Data.m_Parameters.m_BiasEnabled ? 
m_Bias->template GetConstTensor() : nullptr; const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]); const TensorInfo& filterInfo = m_Weight->GetTensorInfo(); @@ -39,7 +36,7 @@ void RefConvolution2dUint8Workload::Execute() const inputData, inputInfo.GetQuantizationScale(), inputInfo.GetQuantizationOffset(), weightsData, weightsInfo.GetQuantizationScale(), weightsInfo.GetQuantizationOffset(), biasData, - outputData, outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(), filterInfo); + outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(), filterInfo); } } //namespace armnn diff --git a/src/backends/reference/workloads/RefDepthwiseConvolution2dFloat32Workload.cpp b/src/backends/reference/workloads/RefDepthwiseConvolution2dFloat32Workload.cpp index e89013b..756e958 100644 --- a/src/backends/reference/workloads/RefDepthwiseConvolution2dFloat32Workload.cpp +++ b/src/backends/reference/workloads/RefDepthwiseConvolution2dFloat32Workload.cpp @@ -23,15 +23,13 @@ void RefDepthwiseConvolution2dFloat32Workload::Execute() const { ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefDepthwiseConvolution2dFloat32Workload_Execute"); - float* outputData = GetOutputTensorDataFloat(0, m_Data); const float* inputData = GetInputTensorDataFloat(0, m_Data); const float* weightData = m_Weight->template GetConstTensor(); - const float* biasData = m_Data.m_Parameters.m_BiasEnabled ? - m_Bias->template GetConstTensor() : nullptr; + const float* biasData = m_Data.m_Parameters.m_BiasEnabled ? 
m_Bias->template GetConstTensor() : nullptr; const TensorInfo& filterInfo = m_Weight->GetTensorInfo(); ConvImpl - (m_Data, inputData, 0.0f, 0, weightData, 0.0f, 0, biasData, outputData, 0.0f, 0, filterInfo, true); + (m_Data, inputData, 0.0f, 0, weightData, 0.0f, 0, biasData, 0.0f, 0, filterInfo, true); } } //namespace armnn diff --git a/src/backends/reference/workloads/RefDepthwiseConvolution2dUint8Workload.cpp b/src/backends/reference/workloads/RefDepthwiseConvolution2dUint8Workload.cpp index e8e501d..629b729 100644 --- a/src/backends/reference/workloads/RefDepthwiseConvolution2dUint8Workload.cpp +++ b/src/backends/reference/workloads/RefDepthwiseConvolution2dUint8Workload.cpp @@ -28,10 +28,7 @@ void RefDepthwiseConvolution2dUint8Workload::Execute() const const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]); const uint8_t* weightsData = m_Weight->template GetConstTensor(); const TensorInfo& weightsInfo = GetTensorInfo(m_Weight.get()); - const int32_t* biasData = m_Data.m_Parameters.m_BiasEnabled ? - m_Bias->template GetConstTensor() : - nullptr; - uint8_t* outputData = GetOutputTensorDataU8(0, m_Data); + const int32_t* biasData = m_Data.m_Parameters.m_BiasEnabled ? m_Bias->template GetConstTensor() : nullptr; const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]); const TensorInfo& filterInfo = m_Weight->GetTensorInfo(); @@ -40,7 +37,7 @@ void RefDepthwiseConvolution2dUint8Workload::Execute() const inputData, inputInfo.GetQuantizationScale(), inputInfo.GetQuantizationOffset(), weightsData, weightsInfo.GetQuantizationScale(), weightsInfo.GetQuantizationOffset(), biasData, - outputData, outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(), filterInfo, true); + outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(), filterInfo, true); } } //namespace armnn -- 2.7.4