From 747ef82c88f9afe14a8b80b6b3b34118353e97f2 Mon Sep 17 00:00:00 2001
From: Matteo Martincigh <matteo.martincigh@arm.com>
Date: Tue, 18 Dec 2018 09:26:39 +0000
Subject: [PATCH] MLCE-77 Depthwise Convolution with depth multiplier > 1
 doesn't work

 * Unified ArmNN's weight format to [ M, I, H, W ] for the depthwise convolution
 * Added conversion utilities to permute/reshape the weights as appropriate
   when using CL and Neon backends
 * Updated the reference implementation of the convolution
 * Updated the relevant unit tests accordingly

!android-nn-driver:459

Change-Id: I07d0818efa9d1ca1e5dad82983aac1fe78eadb18
---
 src/armnn/layers/DepthwiseConvolution2dLayer.cpp   |  40 ++++----
 src/armnn/test/CreateWorkload.hpp                  |  20 ++--
 src/armnn/test/OptimizerTests.cpp                  |   2 +-
 src/armnnTfLiteParser/TfLiteParser.cpp             | 105 +++++++++++++------
 src/armnnTfLiteParser/TfLiteParser.hpp             |  32 ++++--
 src/armnnTfParser/TfParser.cpp                     |  16 ++-
 src/armnnUtils/ParserPrototxtFixture.hpp           |   2 -
 src/armnnUtils/Permute.cpp                         |  57 +++++++++++
 src/armnnUtils/Permute.hpp                         |   5 +-
 src/backends/aclCommon/ArmComputeTensorUtils.cpp   |  26 ++---
 src/backends/aclCommon/ArmComputeTensorUtils.hpp   |   8 +-
 src/backends/backendsCommon/CMakeLists.txt         |   1 +
 src/backends/backendsCommon/CpuTensorHandle.cpp    |   4 +-
 src/backends/backendsCommon/CpuTensorHandle.hpp    |   6 ++
 src/backends/backendsCommon/WorkloadData.cpp       |   5 +-
 src/backends/backendsCommon/WorkloadUtils.cpp      | 111 +++++++++++++++++++++
 src/backends/backendsCommon/WorkloadUtils.hpp      |  41 ++++++--
 src/backends/backendsCommon/common.mk              |   3 +-
 .../backendsCommon/test/Conv2dTestImpl.hpp         |  64 +++++-------
 src/backends/backendsCommon/test/LayerTests.cpp    |  30 ++----
 .../workloads/ClDepthwiseConvolutionWorkload.cpp   |  49 ++++++---
 .../workloads/NeonDepthwiseConvolutionWorkload.cpp |  72 +++++++------
 src/backends/reference/workloads/ConvImpl.hpp      |  93 ++++++++---------
 .../workloads/RefConvolution2dFloat32Workload.cpp  |   8 +-
 .../workloads/RefConvolution2dUint8Workload.cpp    |   7 +-
 .../RefDepthwiseConvolution2dFloat32Workload.cpp   |   6 +-
 .../RefDepthwiseConvolution2dUint8Workload.cpp     |   7 +-
 27 files changed, 529 insertions(+), 291 deletions(-)
 create mode 100644 src/backends/backendsCommon/WorkloadUtils.cpp
diff --git a/src/armnn/layers/DepthwiseConvolution2dLayer.cpp b/src/armnn/layers/DepthwiseConvolution2dLayer.cpp
index 95d4690..c4edc20 100644
--- a/src/armnn/layers/DepthwiseConvolution2dLayer.cpp
+++ b/src/armnn/layers/DepthwiseConvolution2dLayer.cpp
@@ -24,7 +24,7 @@ DepthwiseConvolution2dLayer::DepthwiseConvolution2dLayer(const DepthwiseConvolut
 {
 }
 
-std::unique_ptr<IWorkload> DepthwiseConvolution2dLayer::CreateWorkload(const Graph&                  graph,
+std::unique_ptr<IWorkload> DepthwiseConvolution2dLayer::CreateWorkload(const Graph& graph,
                                                                        const IWorkloadFactory& factory) const
 {
     // on this level constant data should not be released..
@@ -59,34 +59,40 @@ std::vector<TensorShape>
 DepthwiseConvolution2dLayer::InferOutputShapes(const std::vector<TensorShape>& inputShapes) const
 {
     BOOST_ASSERT(inputShapes.size() == 2);
-    const TensorShape& inputShape = inputShapes[0];
-    const TensorShape filterShape = inputShapes[1];
+    const TensorShape& inputShape  = inputShapes[0];
+    const TensorShape& filterShape = inputShapes[1];
 
     BOOST_ASSERT_MSG(inputShape.GetNumDimensions() == 4, "Convolutions will always have 4D input.");
 
     DataLayoutIndexed dataLayoutIndex(m_Param.m_DataLayout);
 
-    unsigned int inWidth = inputShape[dataLayoutIndex.GetWidthIndex()];
-    unsigned int inHeight = inputShape[dataLayoutIndex.GetHeightIndex()];
-    unsigned int inBatchSize = inputShape[0];
+    unsigned int inputBatchSize = inputShape[0];
+    unsigned int inputHeight    = inputShape[dataLayoutIndex.GetHeightIndex()];
+    unsigned int inputWidth     = inputShape[dataLayoutIndex.GetWidthIndex()];
+    unsigned int inputChannels  = inputShape[dataLayoutIndex.GetChannelsIndex()];
 
-    unsigned int filterWidth = filterShape[dataLayoutIndex.GetWidthIndex()];
-    unsigned int readWidth = (inWidth + m_Param.m_PadLeft + m_Param.m_PadRight) - (filterWidth);
-    unsigned int outWidth =  1 + (readWidth / m_Param.m_StrideX);
+    // Expected filter shape: [ M, I, H, W ] - This shape does NOT depend on the data layout
+    // Namely: [ depth multiplier, input channels, filter height, filter width ]
+    // Output channels = input channels * depthMultiplier
 
-    unsigned int filterHeight = filterShape[dataLayoutIndex.GetHeightIndex()];
-    unsigned int readHeight = (inHeight + m_Param.m_PadTop + m_Param.m_PadBottom) - (filterHeight);
-    unsigned int outHeight = 1 + (readHeight / m_Param.m_StrideY);
     unsigned int depthMultiplier = filterShape[0];
 
-    unsigned int outChannels = filterShape[dataLayoutIndex.GetChannelsIndex()] * depthMultiplier;
-    unsigned int outBatchSize = inBatchSize;
+    unsigned int filterHeight = filterShape[2];
+    unsigned int readHeight   = (inputHeight + m_Param.m_PadTop + m_Param.m_PadBottom) - filterHeight;
+    unsigned int outputHeight = 1 + (readHeight / m_Param.m_StrideY);
+
+    unsigned int filterWidth = filterShape[3];
+    unsigned int readWidth   = (inputWidth + m_Param.m_PadLeft + m_Param.m_PadRight) - filterWidth;
+    unsigned int outputWidth = 1 + (readWidth / m_Param.m_StrideX);
+
+    unsigned int outputChannels  = inputChannels * depthMultiplier;
+    unsigned int outputBatchSize = inputBatchSize;
 
     TensorShape tensorShape = m_Param.m_DataLayout == armnn::DataLayout::NHWC ?
-        TensorShape( { outBatchSize, outHeight, outWidth, outChannels } ) :
-        TensorShape( { outBatchSize, outChannels, outHeight, outWidth });
+                              TensorShape{ outputBatchSize, outputHeight, outputWidth, outputChannels } :
+                              TensorShape{ outputBatchSize, outputChannels, outputHeight, outputWidth };
 
-    return std::vector<TensorShape>({ tensorShape });
+    return std::vector<TensorShape>{ tensorShape };
 }
 
 void DepthwiseConvolution2dLayer::ValidateTensorShapesFromInputs()
diff --git a/src/armnn/test/CreateWorkload.hpp b/src/armnn/test/CreateWorkload.hpp
index 3dc18b9..f52f605 100644
--- a/src/armnn/test/CreateWorkload.hpp
+++ b/src/armnn/test/CreateWorkload.hpp
@@ -414,18 +414,18 @@ std::unique_ptr<DepthwiseConvolution2dFloat32Workload> CreateDepthwiseConvolutio
 {
     // Creates the layer we're testing.
     DepthwiseConvolution2dDescriptor layerDesc;
-    layerDesc.m_PadLeft         = 1;
-    layerDesc.m_PadRight        = 2;
-    layerDesc.m_PadTop          = 1;
-    layerDesc.m_PadBottom       = 2;
-    layerDesc.m_StrideX         = 1;
-    layerDesc.m_StrideY         = 1;
-    layerDesc.m_BiasEnabled     = false;
-    layerDesc.m_DataLayout = dataLayout;
+    layerDesc.m_PadLeft     = 1;
+    layerDesc.m_PadRight    = 2;
+    layerDesc.m_PadTop      = 1;
+    layerDesc.m_PadBottom   = 2;
+    layerDesc.m_StrideX     = 1;
+    layerDesc.m_StrideY     = 1;
+    layerDesc.m_BiasEnabled = false;
+    layerDesc.m_DataLayout  = dataLayout;
 
     DepthwiseConvolution2dLayer* const layer = graph.AddLayer<DepthwiseConvolution2dLayer>(layerDesc, "layer");
 
-    layer->m_Weight = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({1, 4, 4, 2}, DataType));
+    layer->m_Weight = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({1, 2, 4, 4}, DataType)); // [ M, I, H, W ]
     layer->m_Weight->Allocate();
 
     // Creates extra layers.
@@ -457,7 +457,7 @@ std::unique_ptr<DepthwiseConvolution2dFloat32Workload> CreateDepthwiseConvolutio
 
     BOOST_TEST(queueDescriptor.m_Inputs.size() == 1);
     BOOST_TEST(queueDescriptor.m_Outputs.size() == 1);
-    BOOST_TEST((queueDescriptor.m_Weight->GetTensorInfo() == TensorInfo({1, 4, 4, 2}, DataType)));
+    BOOST_TEST((queueDescriptor.m_Weight->GetTensorInfo() == TensorInfo({1, 2, 4, 4}, DataType)));
 
     // Returns so we can do extra, backend-specific tests.
     return workload;
diff --git a/src/armnn/test/OptimizerTests.cpp b/src/armnn/test/OptimizerTests.cpp
index 29d1702..80addb4 100644
--- a/src/armnn/test/OptimizerTests.cpp
+++ b/src/armnn/test/OptimizerTests.cpp
@@ -898,7 +898,7 @@ BOOST_AUTO_TEST_CASE(DepthwiseConv2dValidateTensorShapesFromInputsNhwc)
 {
     Graph graph;
     const unsigned int inputShape[] = { 1, 3, 3, 2 };
-    const unsigned int weightsShape[] = { 1, 3, 3, 2 };
+    const unsigned int weightsShape[] = { 1, 2, 3, 3 };
     const unsigned int outputShape[] = { 1, 1, 1, 2 };
     CreateDepthwiseConvolution2dGraph(graph, inputShape, weightsShape, outputShape, DataLayout::NHWC);
 
diff --git a/src/armnnTfLiteParser/TfLiteParser.cpp b/src/armnnTfLiteParser/TfLiteParser.cpp
index 49bc737..3b50476 100644
--- a/src/armnnTfLiteParser/TfLiteParser.cpp
+++ b/src/armnnTfLiteParser/TfLiteParser.cpp
@@ -401,7 +401,8 @@ template<typename T>
 std::pair<armnn::ConstTensor, std::unique_ptr<T[]>>
 CreateConstTensorImpl(TfLiteParser::BufferRawPtr bufferPtr,
                       TfLiteParser::TensorRawPtr tensorPtr,
-                      armnn::TensorInfo & tensorInfo)
+                      armnn::TensorInfo& tensorInfo,
+                      armnn::Optional<armnn::PermutationVector&> permutationVector)
 {
     BOOST_ASSERT_MSG(tensorPtr != nullptr, "tensorPtr is null");
     BOOST_ASSERT_MSG(bufferPtr != nullptr,
@@ -409,7 +410,20 @@ CreateConstTensorImpl(TfLiteParser::BufferRawPtr bufferPtr,
             boost::format("Buffer for buffer:%1% is null") % tensorPtr->buffer).c_str());
 
     std::unique_ptr<T[]> data(new T[tensorInfo.GetNumElements()]);
-    ::memcpy(data.get(), bufferPtr->data.data(), tensorInfo.GetNumBytes());
+
+    if (permutationVector.has_value() && permutationVector.value().GetSize() > 0)
+    {
+        tensorInfo = armnnUtils::Permuted(tensorInfo, permutationVector.value());
+        armnnUtils::Permute(tensorInfo.GetShape(),
+                            permutationVector.value(),
+                            reinterpret_cast<const T *>(bufferPtr->data.data()),
+                            data.get());
+    }
+    else
+    {
+        ::memcpy(data.get(), bufferPtr->data.data(), tensorInfo.GetNumBytes());
+    }
+
     return std::make_pair(ConstTensor(tensorInfo, data.get()), std::move(data));
 }
 
@@ -660,7 +674,9 @@ void TfLiteParser::ParseConv2D(size_t subgraphIndex, size_t operatorIndex)
     CalcPadding(inputHeight, filterHeight, desc.m_StrideY, desc.m_PadTop, desc.m_PadBottom, options->padding);
     CalcPadding(inputWidth, filterWidth, desc.m_StrideX, desc.m_PadLeft, desc.m_PadRight, options->padding);
 
-    auto filterTensorAndData = CreateConstTensor(inputs[1], filterTensorInfo);
+    auto filterTensorAndData = CreateConstTensor(inputs[1],
+                                                 filterTensorInfo,
+                                                 armnn::Optional<armnn::PermutationVector&>());
     armnn::IConnectableLayer* layer;
 
     auto layerName = boost::str(boost::format("Conv2D:%1%:%2%") % subgraphIndex % operatorIndex);
@@ -669,7 +685,9 @@ void TfLiteParser::ParseConv2D(size_t subgraphIndex, size_t operatorIndex)
     {
         desc.m_BiasEnabled = true;
         armnn::TensorInfo biasTensorInfo = ToTensorInfo(inputs[2]);
-        auto biasTensorAndData = CreateConstTensor(inputs[2], biasTensorInfo);
+        auto biasTensorAndData = CreateConstTensor(inputs[2],
+                                                   biasTensorInfo,
+                                                   armnn::Optional<armnn::PermutationVector&>());
         layer = m_Network->AddConvolution2dLayer(desc,
                                                  filterTensorAndData.first,
                                                  biasTensorAndData.first,
@@ -723,17 +741,27 @@ void TfLiteParser::ParseDepthwiseConv2D(size_t subgraphIndex, size_t operatorInd
     armnn::TensorInfo inputTensorInfo  = ToTensorInfo(inputs[0]);
     armnn::TensorInfo filterTensorInfo = ToTensorInfo(inputs[1]);
 
-    // assuming input is NHWC
+    // Assuming input is NHWC
     unsigned int inputHeight = inputTensorInfo.GetShape()[1];
     unsigned int inputWidth  = inputTensorInfo.GetShape()[2];
-    // assuming the filter is OHWI : Output, H, W, Input
+
+    // TensorflowLite weights come in the format [1, H, W, I * M]
     unsigned int filterHeight = filterTensorInfo.GetShape()[1];
     unsigned int filterWidth  = filterTensorInfo.GetShape()[2];
 
+    // Reshape weights as [ H, W, I, M ]
+    filterTensorInfo.SetShape({ filterHeight,
+                                filterWidth,
+                                inputTensorInfo.GetShape()[3],
+                                filterTensorInfo.GetShape()[3] / inputTensorInfo.GetShape()[3] });
+
+    // Mappings from TensorflowLite filter tensors to the ArmNN filter tensors (ArmNN weights have to be [M, I, H, W])
+    PermutationVector permutationVector{ 2, 3, 1, 0 }; // [H, W, I, M] -> [M, I, H, W]
+
     CalcPadding(inputHeight, filterHeight, desc.m_StrideY, desc.m_PadTop, desc.m_PadBottom, options->padding);
     CalcPadding(inputWidth, filterWidth, desc.m_StrideX, desc.m_PadLeft, desc.m_PadRight, options->padding);
 
-    auto filterTensorAndData = CreateConstTensor(inputs[1], filterTensorInfo);
+    auto filterTensorAndData = CreateConstTensor(inputs[1], filterTensorInfo, permutationVector);
     armnn::IConnectableLayer* layer;
     auto layerName = boost::str(boost::format("DepthwiseConv2D:%1%:%2%") % subgraphIndex % operatorIndex);
 
@@ -741,7 +769,9 @@ void TfLiteParser::ParseDepthwiseConv2D(size_t subgraphIndex, size_t operatorInd
     {
         desc.m_BiasEnabled = true;
         TensorInfo biasTensorInfo = ToTensorInfo(inputs[2]);
-        auto biasTensorAndData = CreateConstTensor(inputs[2], biasTensorInfo);
+        auto biasTensorAndData = CreateConstTensor(inputs[2],
+                                                   biasTensorInfo,
+                                                   armnn::Optional<armnn::PermutationVector&>());
         layer = m_Network->AddDepthwiseConvolution2dLayer(desc,
                                                           filterTensorAndData.first,
                                                           biasTensorAndData.first,
@@ -1228,7 +1258,9 @@ void TfLiteParser::ParseFullyConnected(size_t subgraphIndex, size_t operatorInde
                 % CHECK_LOCATION().AsString()));
     }
 
-    auto filterTensorAndData = CreateConstTensor(inputs[1], filterTensorInfo);
+    auto filterTensorAndData = CreateConstTensor(inputs[1],
+                                                 filterTensorInfo,
+                                                 armnn::Optional<armnn::PermutationVector&>());
     armnn::IConnectableLayer* layer;
     auto layerName = boost::str(boost::format("FullyConnected:%1%:%2%") % subgraphIndex % operatorIndex);
 
@@ -1236,7 +1268,9 @@ void TfLiteParser::ParseFullyConnected(size_t subgraphIndex, size_t operatorInde
     {
         desc.m_BiasEnabled = true;
         TensorInfo biasTensorInfo = ToTensorInfo(inputs[2]);
-        auto biasTensorAndData = CreateConstTensor(inputs[2], biasTensorInfo);
+        auto biasTensorAndData = CreateConstTensor(inputs[2],
+                                                   biasTensorInfo,
+                                                   armnn::Optional<armnn::PermutationVector&>());
         layer = m_Network->AddFullyConnectedLayer(desc,
                                                   filterTensorAndData.first,
                                                   biasTensorAndData.first,
@@ -1561,9 +1595,25 @@ TfLiteParser::BufferRawPtr TfLiteParser::GetBuffer(const ModelPtr& model, size_t
     return model->buffers[bufferIndex].get();
 }
 
+template<typename T>
+std::pair<armnn::ConstTensor, TfLiteParser::SupportedDataStorage>
+TfLiteParser::CreateConstTensorAndStoreData(TfLiteParser::BufferRawPtr bufferPtr,
+                                            TfLiteParser::TensorRawPtr tensorPtr,
+                                            armnn::TensorInfo& tensorInfo,
+                                            armnn::Optional<armnn::PermutationVector&> permutationVector)
+{
+    auto constData = CreateConstTensorImpl<T>(bufferPtr,
+                                              tensorPtr,
+                                              tensorInfo,
+                                              permutationVector);
+    TfLiteParser::SupportedDataStorage storage(std::move(constData.second));
+    return std::make_pair(constData.first, std::move(storage));
+}
+
 std::pair<armnn::ConstTensor, TfLiteParser::SupportedDataStorage>
 TfLiteParser::CreateConstTensor(TensorRawPtr tensorPtr,
-                                armnn::TensorInfo & tensorInfo)
+                                armnn::TensorInfo& tensorInfo,
+                                armnn::Optional<armnn::PermutationVector&> permutationVector)
 {
     CHECK_TENSOR_PTR(tensorPtr);
     auto bufferPtr = GetBuffer(m_Model, tensorPtr->buffer);
@@ -1572,29 +1622,20 @@ TfLiteParser::CreateConstTensor(TensorRawPtr tensorPtr,
     switch (tensorInfo.GetDataType())
     {
         case armnn::DataType::Float32:
-        {
-            auto constData = CreateConstTensorImpl<float>(bufferPtr,
-                                                          tensorPtr,
-                                                          tensorInfo);
-            SupportedDataStorage storage(std::move(constData.second));
-            return std::make_pair(constData.first, std::move(storage));
-        }
+            return CreateConstTensorAndStoreData<float>(bufferPtr,
+                                                        tensorPtr,
+                                                        tensorInfo,
+                                                        permutationVector);
         case armnn::DataType::QuantisedAsymm8:
-        {
-            auto constData = CreateConstTensorImpl<uint8_t>(bufferPtr,
-                                                            tensorPtr,
-                                                            tensorInfo);
-            SupportedDataStorage storage(std::move(constData.second));
-            return std::make_pair(constData.first, std::move(storage));
-        }
+            return CreateConstTensorAndStoreData<uint8_t>(bufferPtr,
+                                                          tensorPtr,
+                                                          tensorInfo,
+                                                          permutationVector);
         case armnn::DataType::Signed32:
-        {
-            auto constData = CreateConstTensorImpl<int32_t>(bufferPtr,
-                                                            tensorPtr,
-                                                            tensorInfo);
-            SupportedDataStorage storage(std::move(constData.second));
-            return std::make_pair(constData.first, std::move(storage));
-        }
+            return CreateConstTensorAndStoreData<int32_t>(bufferPtr,
+                                                          tensorPtr,
+                                                          tensorInfo,
+                                                          permutationVector);
         default:
         {
             std::stringstream errString;
diff --git a/src/armnnTfLiteParser/TfLiteParser.hpp b/src/armnnTfLiteParser/TfLiteParser.hpp
index e7a7469..9195728 100644
--- a/src/armnnTfLiteParser/TfLiteParser.hpp
+++ b/src/armnnTfLiteParser/TfLiteParser.hpp
@@ -129,17 +129,31 @@ private:
     // We don't care about the content, and we want a single datatype to simplify the code.
     struct SupportedDataStorage
     {
-        std::unique_ptr<float[]>    m_FloatData;
-        std::unique_ptr<uint8_t[]>  m_Uint8Data;
-        std::unique_ptr<int32_t[]>  m_Int32Data;
-
-        SupportedDataStorage(std::unique_ptr<float[]> && data);
-        SupportedDataStorage(std::unique_ptr<uint8_t[]> && data);
-        SupportedDataStorage(std::unique_ptr<int32_t[]> && data);
+    public:
+        // Convenience constructors
+        SupportedDataStorage(std::unique_ptr<float[]>&&   data);
+        SupportedDataStorage(std::unique_ptr<uint8_t[]>&& data);
+        SupportedDataStorage(std::unique_ptr<int32_t[]>&& data);
+
+    private:
+        // Pointers to the data buffers
+        std::unique_ptr<float[]>   m_FloatData;
+        std::unique_ptr<uint8_t[]> m_Uint8Data;
+        std::unique_ptr<int32_t[]> m_Int32Data;
     };
 
-    std::pair<armnn::ConstTensor, SupportedDataStorage> CreateConstTensor(TensorRawPtr tensorPtr,
-                                                                          armnn::TensorInfo & tensorInfo);
+
+    template<typename T>
+    std::pair<armnn::ConstTensor, TfLiteParser::SupportedDataStorage>
+    CreateConstTensorAndStoreData(TfLiteParser::BufferRawPtr bufferPtr,
+                                  TfLiteParser::TensorRawPtr tensorPtr,
+                                  armnn::TensorInfo& tensorInfo,
+                                  armnn::Optional<armnn::PermutationVector&> permutationVector);
+
+    std::pair<armnn::ConstTensor, SupportedDataStorage>
+    CreateConstTensor(TensorRawPtr tensorPtr,
+                      armnn::TensorInfo& tensorInfo,
+                      armnn::Optional<armnn::PermutationVector&> permutationVector);
 
     /// The network we're building. Gets cleared after it is passed to the user
     armnn::INetworkPtr                    m_Network;
diff --git a/src/armnnTfParser/TfParser.cpp b/src/armnnTfParser/TfParser.cpp
index 7f04757..7a213c0 100644
--- a/src/armnnTfParser/TfParser.cpp
+++ b/src/armnnTfParser/TfParser.cpp
@@ -1338,13 +1338,9 @@ ParsedTfOperationPtr TfParser::ParseDepthwiseConv2D(const tensorflow::NodeDef& n
     uint32_t inputWidth  = inputTensorInfo.GetShape()[dataLayoutIndexed.GetWidthIndex()];
 
     // Mappings from TensorFlow filter tensors to the ArmNN filter tensors.
-    // Tensorflow weights are [H, W, In, Out].
-    // ArmNN weights have to be [Out, H, W, In] when the data layout is NHWC,
-    // and [Out, In, H, W] when the data layout is NCHW.
-    PermutationVector permutationVector =
-            dataLayout == DataLayout::NHWC ?
-            std::initializer_list<unsigned int>{ 1, 2, 3, 0 } : // NHWC: [H, W, In, Out] -> [Out, H, W, In]
-            std::initializer_list<unsigned int>{ 2, 3, 1, 0 };  // NCHW: [H, W, In, Out] -> [Out, In, H, W]
+    // TensorFlow weights come in the format [H, W, I, M].
+    // ArmNN weights have to be [M, I, H, W].
+    PermutationVector permutationVector{ 2, 3, 1, 0 }; // [H, W, I, M] -> [M, I, H, W]
 
     // Swizzle the tensor using the given permutation vector.
     const TensorInfo& weightTensorInfo = weightNode->GetTensorInfo();
@@ -1358,8 +1354,8 @@ ParsedTfOperationPtr TfParser::ParseDepthwiseConv2D(const tensorflow::NodeDef& n
     // Create a weight tensor with the newly swizzled data.
     ConstTensor weightTensor(weightTensorSwizzledInfo, weightTensorSwizzledData);
 
-    uint32_t weightHeight = weightTensor.GetShape()[dataLayoutIndexed.GetHeightIndex()];
-    uint32_t weightWidth  = weightTensor.GetShape()[dataLayoutIndexed.GetWidthIndex()];
+    uint32_t weightHeight = weightTensor.GetShape()[2];
+    uint32_t weightWidth  = weightTensor.GetShape()[3];
 
     bool padding = false;
     TensorInfo outputInfo;
@@ -1393,7 +1389,7 @@ ParsedTfOperationPtr TfParser::ParseDepthwiseConv2D(const tensorflow::NodeDef& n
             outputInfo = TensorInfo({ inputTensorInfo.GetShape()[0],
                                       outputHeight,
                                       outputWidth,
-                                      weightTensor.GetShape()[0] * weightTensor.GetShape()[3]},
+                                      weightTensor.GetShape()[0] * weightTensor.GetShape()[1]},
                                     DataType::Float32);
             break;
         case DataLayout::NCHW:
diff --git a/src/armnnUtils/ParserPrototxtFixture.hpp b/src/armnnUtils/ParserPrototxtFixture.hpp
index fa21aba..acb8f82 100644
--- a/src/armnnUtils/ParserPrototxtFixture.hpp
+++ b/src/armnnUtils/ParserPrototxtFixture.hpp
@@ -14,8 +14,6 @@
 #include <Network.hpp>
 #include <VerificationHelpers.hpp>
 
-#include <backendsCommon/BackendRegistry.hpp>
-
 #include <boost/format.hpp>
 
 #include <string>
diff --git a/src/armnnUtils/Permute.cpp b/src/armnnUtils/Permute.cpp
index 61f4e0e..6deff90 100644
--- a/src/armnnUtils/Permute.cpp
+++ b/src/armnnUtils/Permute.cpp
@@ -9,6 +9,7 @@
 #include <armnn/Tensor.hpp>
 
 #include <cassert>
+#include <cstring>
 
 namespace
 {
@@ -46,10 +47,29 @@ public:
         Unroll(0, srcData, dstData, srcEnd, dstEnd);
     }
 
+    void Unroll(const void* srcData, void* dstData, size_t dataTypeSize)
+    {
+        assert(srcData);
+        assert(dstData);
+        assert(dataTypeSize > 0);
+
+        const unsigned char* srcDataPtr = reinterpret_cast<const unsigned char*>(srcData);
+        unsigned char* dstDataPtr       = reinterpret_cast<unsigned char*>(dstData);
+
+        const unsigned char* const srcEndPtr = srcDataPtr + m_DstShape.GetNumElements() * dataTypeSize;
+        unsigned char* const       dstEndPtr = dstDataPtr + m_DstShape.GetNumElements() * dataTypeSize;
+
+        Unroll(0, srcDataPtr, dstDataPtr, srcEndPtr, dstEndPtr, dataTypeSize);
+    }
+
 private:
     template <typename T>
     void Unroll(size_type dimension, const T* srcData, T* dstData, const T* srcEnd, T* dstEnd)
     {
+        assert(srcData);
+        assert(dstData);
+        assert(srcEnd);
+        assert(dstEnd);
         assert(srcData < srcEnd);
         assert(dstData < dstEnd);
 
@@ -69,6 +89,35 @@ private:
         }
     }
 
+    void Unroll(size_type dimension,
+                const unsigned char* srcData, unsigned char* dstData,
+                const unsigned char* srcEnd, unsigned char* dstEnd,
+                size_t dataTypeSize)
+    {
+        assert(srcData);
+        assert(dstData);
+        assert(srcEnd);
+        assert(dstEnd);
+        assert(srcData < srcEnd);
+        assert(dstData < dstEnd);
+        assert(dataTypeSize > 0);
+
+        if (dimension >= m_DstShape.GetNumDimensions())
+        {
+            ::memcpy(dstData, srcData, dataTypeSize);
+        }
+        else
+        {
+            for (size_type i = 0; i < m_DstShape[dimension]; i++)
+            {
+                Unroll(dimension + 1, srcData, dstData, srcEnd, dstEnd, dataTypeSize);
+
+                srcData += m_SrcStrides[dimension] * dataTypeSize;
+                dstData += m_DstStrides[dimension] * dataTypeSize;
+            }
+        }
+    }
+
     armnn::TensorShape m_DstShape;
     std::array<size_type, armnn::MaxNumOfTensorDimensions> m_SrcStrides;
     std::array<size_type, armnn::MaxNumOfTensorDimensions> m_DstStrides;
@@ -102,6 +151,12 @@ armnn::TensorInfo Permuted(const armnn::TensorInfo& info, const armnn::Permutati
     return outInfo;
 }
 
+void Permute(const armnn::TensorShape& dstShape, const armnn::PermutationVector& mappings,
+             const void* src, void* dst, size_t dataTypeSize)
+{
+    PermuteLoop(dstShape, mappings).Unroll(src, dst, dataTypeSize);
+}
+
 template <typename T>
 void Permute(const armnn::TensorShape& dstShape, const armnn::PermutationVector& mappings, const T* src, T* dst)
 {
@@ -117,5 +172,7 @@ template void Permute(const armnn::TensorShape& dstShape, const armnn::Permutati
                       const uint8_t* src, uint8_t* dst);
 template void Permute(const armnn::TensorShape& dstShape, const armnn::PermutationVector& mappings,
                       const int32_t* src, int32_t* dst);
+template void Permute(const armnn::TensorShape& dstShape, const armnn::PermutationVector& mappings,
+                      const bool* src, bool* dst);
 
 } // namespace armnnUtils
diff --git a/src/armnnUtils/Permute.hpp b/src/armnnUtils/Permute.hpp
index 700ddc7..4e43198 100644
--- a/src/armnnUtils/Permute.hpp
+++ b/src/armnnUtils/Permute.hpp
@@ -14,7 +14,10 @@ armnn::TensorShape Permuted(const armnn::TensorShape& srcShape, const armnn::Per
 
 armnn::TensorInfo Permuted(const armnn::TensorInfo& info, const armnn::PermutationVector& mappings);
 
+void Permute(const armnn::TensorShape& dstShape, const armnn::PermutationVector& mappings,
+             const void* src, void* dst, size_t dataTypeSize);
+
 template <typename T>
 void Permute(const armnn::TensorShape& dstShape, const armnn::PermutationVector& mappings, const T* src, T* dst);
 
-} // namespace armnnUtils
\ No newline at end of file
+} // namespace armnnUtils
diff --git a/src/backends/aclCommon/ArmComputeTensorUtils.cpp b/src/backends/aclCommon/ArmComputeTensorUtils.cpp
index a2d7d8c..32af42f 100644
--- a/src/backends/aclCommon/ArmComputeTensorUtils.cpp
+++ b/src/backends/aclCommon/ArmComputeTensorUtils.cpp
@@ -109,19 +109,6 @@ arm_compute::TensorInfo BuildArmComputeTensorInfo(const armnn::TensorInfo& tenso
     return arm_compute::TensorInfo(aclTensorShape, 1, aclDataType, aclQuantizationInfo);
 }
 
-arm_compute::DataLayout ConvertDataLayout(armnn::DataLayout dataLayout)
-{
-    switch(dataLayout)
-    {
-        case armnn::DataLayout::NHWC : return arm_compute::DataLayout::NHWC;
-
-        case armnn::DataLayout::NCHW : return arm_compute::DataLayout::NCHW;
-
-        default: throw InvalidArgumentException("Unknown armnn::DataLayout: [" +
-                                                std::to_string(static_cast<int>(dataLayout)) + "]");
-    }
-}
-
 arm_compute::TensorInfo BuildArmComputeTensorInfo(const armnn::TensorInfo& tensorInfo,
                                                   armnn::DataLayout dataLayout)
 {
@@ -136,6 +123,19 @@ arm_compute::TensorInfo BuildArmComputeTensorInfo(const armnn::TensorInfo& tenso
     return clTensorInfo;
 }
 
+arm_compute::DataLayout ConvertDataLayout(armnn::DataLayout dataLayout)
+{
+    switch(dataLayout)
+    {
+        case armnn::DataLayout::NHWC : return arm_compute::DataLayout::NHWC;
+
+        case armnn::DataLayout::NCHW : return arm_compute::DataLayout::NCHW;
+
+        default: throw InvalidArgumentException("Unknown armnn::DataLayout: [" +
+                                                std::to_string(static_cast<int>(dataLayout)) + "]");
+    }
+}
+
 arm_compute::PoolingLayerInfo BuildArmComputePoolingLayerInfo(const Pooling2dDescriptor& descriptor)
 {
     using arm_compute::PoolingType;
diff --git a/src/backends/aclCommon/ArmComputeTensorUtils.hpp b/src/backends/aclCommon/ArmComputeTensorUtils.hpp
index fbd850c..fa455b7 100644
--- a/src/backends/aclCommon/ArmComputeTensorUtils.hpp
+++ b/src/backends/aclCommon/ArmComputeTensorUtils.hpp
@@ -36,16 +36,16 @@ arm_compute::TensorShape BuildArmComputeTensorShape(const armnn::TensorShape& te
 /// armnn::ITensorInfo.
 arm_compute::TensorInfo BuildArmComputeTensorInfo(const armnn::TensorInfo& tensorInfo);
 
-/// Utility function used to convert armnn::DataLayout to arm_compute::DataLayout
-/// armnn::DataLayout.
-arm_compute::DataLayout ConvertDataLayout(armnn::DataLayout dataLayout);
-
 /// Utility function used to setup an arm_compute::ITensorInfo object whose dimensions are based on the given
 /// armnn::ITensorInfo.
 /// armnn::DataLayout.
 arm_compute::TensorInfo BuildArmComputeTensorInfo(const armnn::TensorInfo& tensorInfo,
                                                   armnn::DataLayout dataLayout);
 
+/// Utility function used to convert an armnn::DataLayout value into the equivalent
+/// arm_compute::DataLayout value.
+arm_compute::DataLayout ConvertDataLayout(armnn::DataLayout dataLayout);
+
 /// Utility function used to setup an arm_compute::PoolingLayerInfo object from an armnn::Pooling2dDescriptor.
 arm_compute::PoolingLayerInfo BuildArmComputePoolingLayerInfo(const Pooling2dDescriptor& descriptor);
 
diff --git a/src/backends/backendsCommon/CMakeLists.txt b/src/backends/backendsCommon/CMakeLists.txt
index f295630..b120f51 100644
--- a/src/backends/backendsCommon/CMakeLists.txt
+++ b/src/backends/backendsCommon/CMakeLists.txt
@@ -27,6 +27,7 @@ list(APPEND armnnBackendsCommon_sources
     WorkloadFactory.hpp
     Workload.hpp
     WorkloadInfo.hpp
+    WorkloadUtils.cpp
     WorkloadUtils.hpp
 )
 
diff --git a/src/backends/backendsCommon/CpuTensorHandle.cpp b/src/backends/backendsCommon/CpuTensorHandle.cpp
index fe0c634..9dcd3f3 100644
--- a/src/backends/backendsCommon/CpuTensorHandle.cpp
+++ b/src/backends/backendsCommon/CpuTensorHandle.cpp
@@ -18,7 +18,7 @@ ConstCpuTensorHandle::ConstCpuTensorHandle(const TensorInfo& tensorInfo)
 }
 
 template <>
-const void* ConstCpuTensorHandle::GetConstTensor() const
+const void* ConstCpuTensorHandle::GetConstTensor<void>() const
 {
     return m_Memory;
 }
@@ -30,7 +30,7 @@ CpuTensorHandle::CpuTensorHandle(const TensorInfo& tensorInfo)
 }
 
 template <>
-void* CpuTensorHandle::GetTensor() const
+void* CpuTensorHandle::GetTensor<void>() const
 {
     return m_MutableMemory;
 }
diff --git a/src/backends/backendsCommon/CpuTensorHandle.hpp b/src/backends/backendsCommon/CpuTensorHandle.hpp
index ae13d6c..b88a0d3 100644
--- a/src/backends/backendsCommon/CpuTensorHandle.hpp
+++ b/src/backends/backendsCommon/CpuTensorHandle.hpp
@@ -72,6 +72,9 @@ private:
     const void* m_Memory;
 };
 
+template<>
+const void* ConstCpuTensorHandle::GetConstTensor<void>() const;
+
 // Abstract specialization of ConstCpuTensorHandle that allows write access to the same data.
 class CpuTensorHandle : public ConstCpuTensorHandle
 {
@@ -99,6 +102,9 @@ private:
     void* m_MutableMemory;
 };
 
+template <>
+void* CpuTensorHandle::GetTensor<void>() const;
+
 // A CpuTensorHandle that owns the wrapped memory region.
 class ScopedCpuTensorHandle : public CpuTensorHandle
 {
diff --git a/src/backends/backendsCommon/WorkloadData.cpp b/src/backends/backendsCommon/WorkloadData.cpp
index 8847b4e..1dac498 100644
--- a/src/backends/backendsCommon/WorkloadData.cpp
+++ b/src/backends/backendsCommon/WorkloadData.cpp
@@ -593,9 +593,10 @@ void DepthwiseConvolution2dQueueDescriptor::Validate(const WorkloadInfo& workloa
 
     const unsigned int channelIndex = (m_Parameters.m_DataLayout == DataLayout::NCHW) ? 1 : 3;
 
-    //inputChannels * channelMultiplier should be equal to outputChannels.
+    // Expected weight shape: [ M, I, H, W ] - This shape does NOT depend on the data layout
+    // inputChannels * channelMultiplier should be equal to outputChannels.
     const unsigned int numWeightChannelMultiplier = m_Weight->GetTensorInfo().GetShape()[0];
-    const unsigned int numWeightInputChannels = m_Weight->GetTensorInfo().GetShape()[channelIndex];
+    const unsigned int numWeightInputChannels = m_Weight->GetTensorInfo().GetShape()[1];
     const unsigned int numWeightOutputChannels = workloadInfo.m_OutputTensorInfos[0].GetShape()[channelIndex];
     if (numWeightChannelMultiplier * numWeightInputChannels != numWeightOutputChannels)
     {
diff --git a/src/backends/backendsCommon/WorkloadUtils.cpp b/src/backends/backendsCommon/WorkloadUtils.cpp
new file mode 100644
index 0000000..fa387a7
--- /dev/null
+++ b/src/backends/backendsCommon/WorkloadUtils.cpp
@@ -0,0 +1,111 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "WorkloadUtils.hpp"
+
+namespace armnn
+{
+
+armnn::ConstTensor PermuteTensor(const ConstCpuTensorHandle* tensor,
+                                 const PermutationVector& permutationVector,
+                                 void* permuteBuffer)
+{
+    BOOST_ASSERT_MSG(tensor, "Invalid input tensor");
+    BOOST_ASSERT_MSG(permuteBuffer, "Invalid permute buffer");
+    // Start from a copy of the input tensor's info; it is replaced below if a permutation is requested
+    TensorInfo tensorInfo = tensor->GetTensorInfo();
+    // Non-empty vector: permute both the info and the data; empty vector: plain byte copy into permuteBuffer
+    if (permutationVector.GetSize() > 0)
+    {
+        tensorInfo = armnnUtils::Permuted(tensorInfo, permutationVector);
+        armnnUtils::Permute(tensorInfo.GetShape(), permutationVector,
+                            tensor->GetConstTensor<void>(), permuteBuffer,
+                            GetDataTypeSize(tensorInfo.GetDataType()));
+    }
+    else
+    {
+        ::memcpy(permuteBuffer, tensor->GetConstTensor<void>(), tensorInfo.GetNumBytes());
+    }
+    // The returned tensor is constructed over the caller-provided permuteBuffer, not over the input memory
+    return ConstTensor(tensorInfo, permuteBuffer);
+}
+
+void ReshapeWeightsForAcl(TensorInfo& weightInfo, DataLayout dataLayout)
+{
+    // Reshape the weights in-place: only the shape stored in weightInfo is rewritten via SetShape
+    const TensorShape& weightShape = weightInfo.GetShape();
+    switch (dataLayout)
+    {
+        case DataLayout::NHWC:
+            // NHWC: expects weights already permuted to [ H, W, I, M ]; reshape to [ 1, H, W, I * M ]
+            weightInfo.SetShape({ 1,
+                                  weightShape[0],
+                                  weightShape[1],
+                                  weightShape[2] * weightShape[3] });
+            break;
+        case DataLayout::NCHW:
+        default:
+            // NCHW (also taken for any other value via default): reshape from [ M, I, H, W ] to [ 1, I * M, H, W ]
+            weightInfo.SetShape({ 1,
+                                  weightShape[0] * weightShape[1],
+                                  weightShape[2],
+                                  weightShape[3] });
+            break;
+    }
+}
+
+TensorInfo ConvertWeightTensorInfoFromArmnnToAcl(const TensorInfo& weightInfo, DataLayout dataLayout)
+{
+    // Convert the weight format from ArmNN's [ M, I, H, W ] (does NOT depend on the data layout) to either
+    // [ 1, H, W, I * M ] (if NHWC) or [ 1, I * M, H, W ] (if NCHW), as required by the compute library
+
+    // 1. Permute the weights if necessary
+    // If the data layout is NCHW no permutation is necessary, as a reshape to [ 1, I * M, H, W ] can be better done
+    // starting from the current shape of [ M, I, H, W ]
+    TensorInfo weightPermutedInfo(weightInfo);
+    if (dataLayout == DataLayout::NHWC)
+    {
+        // The data layout is NHWC, then permute the weights from [ M, I, H, W ] to [ H, W, I, M ]
+        PermutationVector permutationVector{ 3, 2, 0, 1 };
+        weightPermutedInfo = armnnUtils::Permuted(weightInfo, permutationVector);
+    }
+
+    // 2. Reshape the weights
+    ReshapeWeightsForAcl(weightPermutedInfo, dataLayout);
+
+    // 3. Return the permuted and reshaped weight info (a copy; the input weightInfo is not modified)
+    return weightPermutedInfo;
+}
+
+armnn::ConstTensor ConvertWeightTensorFromArmnnToAcl(const ConstCpuTensorHandle* weightTensor,
+                                                     DataLayout dataLayout,
+                                                     void* permuteBuffer)
+{
+    BOOST_ASSERT_MSG(weightTensor, "Invalid input tensor");
+    BOOST_ASSERT_MSG(permuteBuffer, "Invalid permute buffer");
+
+    // Convert the weight format from ArmNN's [ M, I, H, W ] (does NOT depend on the data layout) to either
+    // [ 1, H, W, I * M ] (if NHWC) or [ 1, I * M, H, W ] (if NCHW), as required by the compute library
+
+    // 1. Permute the weights if necessary
+    // If the data layout is NCHW no permutation is necessary, as a reshape to [ 1, I * M, H, W ] can be better done
+    // starting from the current shape of [ M, I, H, W ]
+    // If no permutation is necessary, leave the permutation vector empty
+    PermutationVector permutationVector{};
+    if (dataLayout == DataLayout::NHWC)
+    {
+        // The data layout is NHWC, then permute the weights from [ M, I, H, W ] to [ H, W, I, M ]
+        permutationVector = { 3, 2, 0, 1 };
+    }
+    ConstTensor weightPermuted = PermuteTensor(weightTensor, permutationVector, permuteBuffer);
+
+    // 2. Reshape the weights (only the tensor info is changed; the data written to permuteBuffer is not moved)
+    ReshapeWeightsForAcl(weightPermuted.GetInfo(), dataLayout);
+
+    // 3. Return the tensor, which wraps the caller-provided permuteBuffer (no storage is allocated or returned here)
+    return weightPermuted;
+}
+
+} // namespace armnn
diff --git a/src/backends/backendsCommon/WorkloadUtils.hpp b/src/backends/backendsCommon/WorkloadUtils.hpp
index 2b07b2b..a1a8d2a 100644
--- a/src/backends/backendsCommon/WorkloadUtils.hpp
+++ b/src/backends/backendsCommon/WorkloadUtils.hpp
@@ -6,35 +6,42 @@
 #pragma once
 
 #include "ITensorHandle.hpp"
+#include "CpuTensorHandle.hpp"
 
 #include <armnn/Tensor.hpp>
 
+#include <Permute.hpp>
+#include <Profiling.hpp>
+#include <Half.hpp>
+
 #include <boost/cast.hpp>
 
 namespace armnn
 {
 namespace
 {
+
 template<typename ArrayType, typename Arg>
 void AssignValues(unsigned int num, unsigned int& idx, const ArrayType& array, Arg& arg)
 {
- if (idx >= num)
- {
-     return;
- }
+    if (idx >= num)
+    {
+        return;
+    }
 
- arg = array[(num - 1) - idx];
- idx++;
-};
+    arg = array[(num - 1) - idx];
+    idx++;
+}
 
 template<typename T, typename ArrayType, typename ...Args>
 void AssignValues(unsigned int num, unsigned int idx, const ArrayType& array, T& assignee, Args& ... args)
 {
- AssignValues(num, idx, array, assignee);
+    AssignValues(num, idx, array, assignee);
 
- AssignValues(num, idx, array, args...);
+    AssignValues(num, idx, array, args...);
 }
-} // namespace
+
+} // anonymous namespace
 
 template<typename CopyFunc>
 void CopyTensorContentsGeneric(const ITensorHandle* srcTensor, ITensorHandle* dstTensor, CopyFunc copy)
@@ -142,4 +149,16 @@ void GatherTensorHandlePairs(const DescriptorType& descriptor,
     }
 }
 
-} //namespace armnn
\ No newline at end of file
+armnn::ConstTensor PermuteTensor(const ConstCpuTensorHandle* tensor,
+                                 const PermutationVector& permutationVector,
+                                 void* permuteBuffer);
+
+void ReshapeWeightsForAcl(TensorInfo& weightInfo, DataLayout dataLayout);
+
+TensorInfo ConvertWeightTensorInfoFromArmnnToAcl(const TensorInfo& weightInfo, DataLayout dataLayout);
+
+armnn::ConstTensor ConvertWeightTensorFromArmnnToAcl(const ConstCpuTensorHandle* weightTensor,
+                                                     DataLayout dataLayout,
+                                                     void* permuteBuffer);
+
+} //namespace armnn
diff --git a/src/backends/backendsCommon/common.mk b/src/backends/backendsCommon/common.mk
index a66b5c4..4e79bfc 100644
--- a/src/backends/backendsCommon/common.mk
+++ b/src/backends/backendsCommon/common.mk
@@ -14,7 +14,8 @@ COMMON_SOURCES := \
     MemCopyWorkload.cpp \
     OutputHandler.cpp \
     WorkloadData.cpp \
-    WorkloadFactory.cpp
+    WorkloadFactory.cpp \
+    WorkloadUtils.cpp
 
 # COMMON_TEST_SOURCES contains the list of files to be included
 # in the Android unit test build (armnn-tests) and it is picked
diff --git a/src/backends/backendsCommon/test/Conv2dTestImpl.hpp b/src/backends/backendsCommon/test/Conv2dTestImpl.hpp
index 37fa0f6..2ff66b0 100755
--- a/src/backends/backendsCommon/test/Conv2dTestImpl.hpp
+++ b/src/backends/backendsCommon/test/Conv2dTestImpl.hpp
@@ -327,7 +327,7 @@ LayerTestResult<T, 4> DepthwiseConvolution2dAsymmetricTestImpl(
     armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
     const boost::multi_array<T, 4>& input,
-    const boost::multi_array<T, 4>& originalKernel,
+    const boost::multi_array<T, 4>& kernel,
     const boost::multi_array<B, 1>& bias,
     const boost::multi_array<T, 4>& outputExpected,
     float qScale,
@@ -344,10 +344,10 @@ LayerTestResult<T, 4> DepthwiseConvolution2dAsymmetricTestImpl(
     unsigned int inputChannels  = boost::numeric_cast<unsigned int>(input.shape()[1]);
     unsigned int inputHeight    = boost::numeric_cast<unsigned int>(input.shape()[2]);
     unsigned int inputWidth     = boost::numeric_cast<unsigned int>(input.shape()[3]);
-    unsigned int kernelChanMul  = boost::numeric_cast<unsigned int>(originalKernel.shape()[0]);
-    unsigned int kernelChannels = boost::numeric_cast<unsigned int>(originalKernel.shape()[1]);
-    unsigned int kernelHeight   = boost::numeric_cast<unsigned int>(originalKernel.shape()[2]);
-    unsigned int kernelWidth    = boost::numeric_cast<unsigned int>(originalKernel.shape()[3]);
+    unsigned int kernelChanMul  = boost::numeric_cast<unsigned int>(kernel.shape()[0]);
+    unsigned int kernelChannels = boost::numeric_cast<unsigned int>(kernel.shape()[1]);
+    unsigned int kernelHeight   = boost::numeric_cast<unsigned int>(kernel.shape()[2]);
+    unsigned int kernelWidth    = boost::numeric_cast<unsigned int>(kernel.shape()[3]);
     unsigned int outputNum      = boost::numeric_cast<unsigned int>(outputExpected.shape()[0]);
     unsigned int outputChannels = boost::numeric_cast<unsigned int>(outputExpected.shape()[1]);
     unsigned int outputHeight   = boost::numeric_cast<unsigned int>(outputExpected.shape()[2]);
@@ -362,8 +362,7 @@ LayerTestResult<T, 4> DepthwiseConvolution2dAsymmetricTestImpl(
             armnnUtils::GetTensorInfo<T>(inputNum, inputChannels, inputHeight, inputWidth, layout);
     armnn::TensorInfo outputTensorInfo =
             armnnUtils::GetTensorInfo<T>(outputNum, outputChannels, outputHeight, outputWidth, layout);
-    armnn::TensorInfo kernelDesc =
-            armnnUtils::GetTensorInfo<T>(kernelChanMul, kernelChannels, kernelHeight, kernelWidth, layout);
+    armnn::TensorInfo kernelDesc({kernelChanMul, kernelChannels, kernelHeight, kernelWidth}, armnn::GetDataType<T>());
     armnn::TensorInfo biasDesc({static_cast<unsigned int>(bias.size())}, armnn::GetDataType<B>());
 
     // Set quantization parameters if the requested type is a quantized type.
@@ -423,13 +422,6 @@ LayerTestResult<T, 4> DepthwiseConvolution2dAsymmetricTestImpl(
 
     armnn::ScopedCpuTensorHandle weightsTensor(kernelDesc);
 
-    // Permute the kernel if necessary
-    boost::multi_array<T, 4> kernel = boost::multi_array<T, 4>(originalKernel);
-    if (layout == armnn::DataLayout::NHWC)
-    {
-        armnnUtils::Permute(kernelDesc.GetShape(), NCHWToNHWC, originalKernel.data(), kernel.data());
-    }
-
     AllocateAndCopyDataToITensorHandle(&weightsTensor, &kernel[0][0][0][0]);
 
     armnn::ScopedCpuTensorHandle biasTensor(biasDesc);
@@ -484,6 +476,7 @@ LayerTestResult<T, 4> DepthwiseConvolution2dDepthMul1TestImpl(
     unsigned int kernelHeight = 3;
     unsigned int kernelWidth = 3;
     unsigned int kernelChannels = inputChannels;
+    unsigned int kernelDepthMultiplier = 1;
 
     unsigned int outputHeight = 1;
     unsigned int outputWidth = 1;
@@ -494,7 +487,8 @@ LayerTestResult<T, 4> DepthwiseConvolution2dDepthMul1TestImpl(
             armnnUtils::GetTensorInfo<T>(inputNum, inputChannels, inputHeight, inputWidth, layout);
     armnn::TensorInfo outputTensorInfo =
             armnnUtils::GetTensorInfo<T>(outputNum, outputChannels, outputHeight, outputWidth, layout);
-    armnn::TensorInfo kernelDesc = armnnUtils::GetTensorInfo<T>(1, outputChannels, kernelHeight, kernelWidth, layout);
+    armnn::TensorInfo kernelDesc({kernelDepthMultiplier, kernelChannels, kernelHeight, kernelWidth},
+                                 armnn::GetDataType<T>());
     armnn::TensorInfo biasDesc({ outputChannels }, armnn::GetDataType<B>());
 
     // Set quantization parameters if the requested type is a quantized type.
@@ -543,12 +537,6 @@ LayerTestResult<T, 4> DepthwiseConvolution2dDepthMul1TestImpl(
                     0.f, 0.f,  0.f,
                     -1.f, 0.f, -1.f,
             }));
-    if (layout == armnn::DataLayout::NHWC)
-    {
-        std::vector<T> tmp(kernelData.size());
-        armnnUtils::Permute(kernelDesc.GetShape(), NCHWToNHWC, kernelData.data(), tmp.data());
-        kernelData = tmp;
-    }
     auto kernel = MakeTensor<T, 4>(kernelDesc, kernelData);
 
     // Manually calculated.
@@ -642,8 +630,8 @@ LayerTestResult<T, 4> DepthwiseConvolution2dTestImpl(
             inputBatchSize, inputChannels, inputHeight, inputWidth, layout);
     armnn::TensorInfo outputTensorInfo = armnnUtils::GetTensorInfo<T>(
             outputBatchSize, outputChannels, outputHeight, outputWidth, layout);
-    armnn::TensorInfo kernelDesc = armnnUtils::GetTensorInfo<T>(
-            depthMultiplier, inputChannels, kernelHeight, kernelWidth, layout);
+    armnn::TensorInfo kernelDesc({depthMultiplier, inputChannels, kernelHeight, kernelWidth},
+                                 armnn::GetDataType<T>());
     armnn::TensorInfo biasDesc({outputChannels}, armnn::GetDataType<B>());
 
     // Set quantization parameters if the requested type is a quantized type.
@@ -692,7 +680,7 @@ LayerTestResult<T, 4> DepthwiseConvolution2dTestImpl(
         {0, 2, 1, -1}));
     auto bias = MakeTensor<B, 1>(biasDesc, biasV);
 
-    std::vector<T> originalKernelData = std::vector<T>(
+    std::vector<T> kernelData = std::vector<T>(
             QuantizedVector<T>(kernelDesc.GetQuantizationScale(), kernelDesc.GetQuantizationOffset(), {
                     1, 1, 1,
                     1, -1, 1,
@@ -717,12 +705,8 @@ LayerTestResult<T, 4> DepthwiseConvolution2dTestImpl(
                     0, 1, 0,
                     0, 0, 0,
                     0, 0, 0
+
             }));
-    std::vector<T> kernelData = originalKernelData;
-    if (layout == armnn::DataLayout::NHWC)
-    {
-        armnnUtils::Permute(kernelDesc.GetShape(), NCHWToNHWC, originalKernelData.data(), kernelData.data());
-    }
     auto kernel = MakeTensor<T, 4>(kernelDesc, kernelData);
 
     // Manually calculated.
@@ -840,9 +824,9 @@ LayerTestResult<T, 4> DepthwiseConvolution2dNhwcTestImpl(
     unsigned int inputWidth     = boost::numeric_cast<unsigned int>(input.shape()[2]);
 
     unsigned int kernelChanMul  = boost::numeric_cast<unsigned int>(kernel.shape()[0]);
-    unsigned int kernelChannels = boost::numeric_cast<unsigned int>(kernel.shape()[3]);
-    unsigned int kernelHeight   = boost::numeric_cast<unsigned int>(kernel.shape()[1]);
-    unsigned int kernelWidth    = boost::numeric_cast<unsigned int>(kernel.shape()[2]);
+    unsigned int kernelChannels = boost::numeric_cast<unsigned int>(kernel.shape()[1]);
+    unsigned int kernelHeight   = boost::numeric_cast<unsigned int>(kernel.shape()[2]);
+    unsigned int kernelWidth    = boost::numeric_cast<unsigned int>(kernel.shape()[3]);
 
     unsigned int outputNum      = boost::numeric_cast<unsigned int>(outputExpected.shape()[0]);
     unsigned int outputChannels = boost::numeric_cast<unsigned int>(outputExpected.shape()[3]);
@@ -853,7 +837,7 @@ LayerTestResult<T, 4> DepthwiseConvolution2dNhwcTestImpl(
     armnn::TensorInfo inputTensorInfo({inputNum, inputHeight, inputWidth, inputChannels}, armnn::GetDataType<T>());
     armnn::TensorInfo outputTensorInfo({outputNum, outputHeight, outputWidth, outputChannels},
                                        armnn::GetDataType<T>());
-    armnn::TensorInfo kernelDesc({kernelChanMul, kernelHeight, kernelWidth, kernelChannels}, armnn::GetDataType<T>());
+    armnn::TensorInfo kernelDesc({kernelChanMul, kernelChannels, kernelHeight, kernelWidth}, armnn::GetDataType<T>());
     armnn::TensorInfo biasDesc({static_cast<unsigned int>(bias.size())}, armnn::GetDataType<B>());
 
     // Set quantization parameters if the requested type is a quantized type.
@@ -1068,10 +1052,10 @@ LayerTestResult<T,4> CompareConvolution2dTestImpl(
     armnn::TensorInfo kernelDesc;
     armnn::TensorInfo biasDesc;
 
-    unsigned int inputShape[]    = {inputNum, inputChannels, inputHeight, inputWidth};
-    unsigned int outputShape[]   = {outputNum, outputChannels, outputHeight, outputWidth};
-    unsigned int kernelShape[]   = {outputChannels, inputChannels, kernelHeight, kernelWidth};
-    unsigned int biasShape[]     = {outputChannels};
+    unsigned int inputShape[]  = {inputNum, inputChannels, inputHeight, inputWidth};
+    unsigned int outputShape[] = {outputNum, outputChannels, outputHeight, outputWidth};
+    unsigned int kernelShape[] = {outputChannels, inputChannels, kernelHeight, kernelWidth};
+    unsigned int biasShape[]   = {outputChannels};
 
     inputTensorInfo = armnn::TensorInfo(4, inputShape, armnn::GetDataType<T>());
     outputTensorInfo = armnn::TensorInfo(4, outputShape, armnn::GetDataType<T>());
@@ -1171,19 +1155,17 @@ LayerTestResult<T, 4> CompareDepthwiseConvolution2dTestImpl(
 
     std::vector<unsigned int> inputShape;
     std::vector<unsigned int> outputShape;
-    std::vector<unsigned int> kernelShape;
-    std::vector<unsigned int> biasShape= { outputChannels };
+    std::vector<unsigned int> kernelShape{ channelMultiplier, inputChannels, kernelHeight, kernelWidth };
+    std::vector<unsigned int> biasShape{ outputChannels };
     switch (layout.GetDataLayout())
     {
         case armnn::DataLayout::NCHW:
             inputShape =  { inputNum, inputChannels, inputHeight, inputWidth };
             outputShape = { outputNum, outputChannels, outputHeight, outputWidth };
-            kernelShape = { channelMultiplier, inputChannels, kernelHeight, kernelWidth };
             break;
         case armnn::DataLayout ::NHWC:
             inputShape =  { inputNum, inputHeight, inputWidth, inputChannels };
             outputShape = { outputNum, outputHeight, outputWidth, outputChannels };
-            kernelShape = { channelMultiplier, kernelHeight, kernelWidth, inputChannels };
             break;
         default:
             throw armnn::InvalidArgumentException("unknown data layout ["
diff --git a/src/backends/backendsCommon/test/LayerTests.cpp b/src/backends/backendsCommon/test/LayerTests.cpp
index ddf0d0b..819b9d6 100755
--- a/src/backends/backendsCommon/test/LayerTests.cpp
+++ b/src/backends/backendsCommon/test/LayerTests.cpp
@@ -661,28 +661,18 @@ LayerTestResult<T, 4> DepthwiseConvolution2dNhwcTestCommon(
             24, 49
         })));
 
-    armnn::TensorInfo kernelTensorInfo({ 1, 4, 4, 2}, armnn::GetDataType<T>());
+    armnn::TensorInfo kernelTensorInfo({ 1, 2, 4, 4 }, armnn::GetDataType<T>());
     auto kernel = MakeTensor<T, 4>(kernelTensorInfo, std::vector<T>(
         QuantizedVector<T>(kernelTensorInfo.GetQuantizationScale(), kernelTensorInfo.GetQuantizationOffset(), {
-             32, 16,
-             31, 15,
-             30, 14,
-             29, 13,
-
-             28, 12,
-             27, 11,
-             26, 10,
-             25,  9,
-
-             24,  8,
-             23,  7,
-             22,  6,
-             21,  5,
-
-             20,  4,
-             19,  3,
-             18,  2,
-             17,  1
+             32, 31, 30, 29,
+             28, 27, 26, 25,
+             24, 23, 22, 21,
+             20, 19, 18, 17,
+
+             16, 15, 14, 13,
+             12, 11, 10,  9,
+              8,  7,  6,  5,
+              4,  3,  2,  1
         })));
 
     armnn::TensorInfo outputTensorInfo({ 1, 5, 5, 2}, armnn::GetDataType<T>());
diff --git a/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp b/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp
index 9cadbf0..1745b82 100644
--- a/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp
+++ b/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp
@@ -12,6 +12,7 @@
 #include <aclCommon/ArmComputeTensorUtils.hpp>
 #include <cl/ClTensorHandle.hpp>
 #include <backendsCommon/CpuTensorHandle.hpp>
+#include <backendsCommon/WorkloadUtils.hpp>
 
 #include <arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h>
 
@@ -21,14 +22,23 @@ namespace armnn
 using namespace armcomputetensorutils;
 
 arm_compute::Status ClDepthwiseConvolutionWorkloadValidate(const TensorInfo& input,
-    const TensorInfo& output,
-    const DepthwiseConvolution2dDescriptor& descriptor,
-    const TensorInfo& weights,
-    const Optional<TensorInfo>& biases)
+                                                           const TensorInfo& output,
+                                                           const DepthwiseConvolution2dDescriptor& descriptor,
+                                                           const TensorInfo& weights,
+                                                           const Optional<TensorInfo>& biases)
 {
-    const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input, descriptor.m_DataLayout);
+    const arm_compute::TensorInfo aclInputInfo  = BuildArmComputeTensorInfo(input,  descriptor.m_DataLayout);
     const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output, descriptor.m_DataLayout);
-    const arm_compute::TensorInfo aclWeightsInfo = BuildArmComputeTensorInfo(weights, descriptor.m_DataLayout);
+
+    // ArmNN's weight format is [ M, I, H, W ]
+    const unsigned int aclDepthMultiplier = weights.GetShape()[0];
+
+    // Convert the weight format from ArmNN's [ M, I, H, W ] (does NOT depend on the data layout) to either
+    // [ 1, H, W, I * M ] (if NHWC) or [ 1, I * M, H, W ] (if NCHW), as required by the compute library
+    TensorInfo weightsPermuted = ConvertWeightTensorInfoFromArmnnToAcl(weights, descriptor.m_DataLayout);
+
+    // Convert the weights into the compute library format
+    const arm_compute::TensorInfo aclWeightsInfo = BuildArmComputeTensorInfo(weightsPermuted, descriptor.m_DataLayout);
 
     arm_compute::TensorInfo aclBiasesInfo;
     arm_compute::TensorInfo *optionalAclBiasesInfo = nullptr;
@@ -42,7 +52,6 @@ arm_compute::Status ClDepthwiseConvolutionWorkloadValidate(const TensorInfo& inp
     }
 
     const arm_compute::PadStrideInfo aclPadStrideInfo = BuildArmComputePadStrideInfo(descriptor);
-    const unsigned int aclDepthMultiplier = weights.GetShape()[0];
 
     return arm_compute::CLDepthwiseConvolutionLayer::validate(&aclInputInfo,
                                                               &aclWeightsInfo,
@@ -57,10 +66,18 @@ ClDepthwiseConvolutionWorkload::ClDepthwiseConvolutionWorkload(
     const WorkloadInfo& info)
     : BaseWorkload<DepthwiseConvolution2dQueueDescriptor>(descriptor, info)
 {
-    auto& weightInfo = m_Data.m_Weight->GetTensorInfo();
+    // Allocate a buffer for the swizzling of the weight tensor
+    std::unique_ptr<unsigned char[]> permuteBuffer(new unsigned char[m_Data.m_Weight->GetTensorInfo().GetNumBytes()]);
+
+    // Convert the weight format from ArmNN's [ M, I, H, W ] (does NOT depend on the data layout) to either
+    // [ 1, H, W, I * M ] (if NHWC) or [ 1, I * M, H, W ] (if NCHW), as required by the compute library
+    ConstTensor weightPermuted = ConvertWeightTensorFromArmnnToAcl(m_Data.m_Weight,
+                                                                   m_Data.m_Parameters.m_DataLayout,
+                                                                   permuteBuffer.get());
 
+    // Convert the weights into the compute library format
     m_KernelTensor = std::make_unique<arm_compute::CLTensor>();
-    BuildArmComputeTensor(*m_KernelTensor, weightInfo, m_Data.m_Parameters.m_DataLayout);
+    BuildArmComputeTensor(*m_KernelTensor, weightPermuted.GetInfo(), m_Data.m_Parameters.m_DataLayout);
 
     if (m_Data.m_Parameters.m_BiasEnabled)
     {
@@ -86,13 +103,14 @@ ClDepthwiseConvolutionWorkload::ClDepthwiseConvolutionWorkload(
     input.info()->set_data_layout(aclDataLayout);
     output.info()->set_data_layout(aclDataLayout);
 
-    const unsigned int depthMultiplier = weightInfo.GetShape()[0];
+    // ArmNN's weight format is [ M, I, H, W ]
+    auto& weightInfo = m_Data.m_Weight->GetTensorInfo();
 
-    const unsigned int widthIndex = (m_Data.m_Parameters.m_DataLayout == DataLayout::NCHW) ? 3 : 2;
-    const unsigned int heightIndex = (m_Data.m_Parameters.m_DataLayout == DataLayout::NCHW) ? 2 : 1;
+    // Get the depth multiplier
+    const unsigned int depthMultiplier = weightInfo.GetShape()[0];
 
-    //Check for optimisation opportunities.
-    bool use3x3Optimisation = (weightInfo.GetShape()[widthIndex] == 3) && (weightInfo.GetShape()[heightIndex] == 3);
+    // Check for optimisation opportunities.
+    bool use3x3Optimisation = (weightInfo.GetShape()[2] == 3) && (weightInfo.GetShape()[3] == 3);
     if (use3x3Optimisation)
     {
         m_DepthwiseConvolutionLayer = std::make_unique<arm_compute::CLDepthwiseConvolutionLayer3x3>();
@@ -118,7 +136,8 @@ ClDepthwiseConvolutionWorkload::ClDepthwiseConvolutionWorkload(
 
     BOOST_ASSERT(m_DepthwiseConvolutionLayer);
 
-    InitializeArmComputeClTensorData(*m_KernelTensor, m_Data.m_Weight);
+    ScopedCpuTensorHandle weightsPermutedHandle(weightPermuted);
+    InitializeArmComputeClTensorData(*m_KernelTensor, &weightsPermutedHandle);
 
     if (m_BiasTensor)
     {
diff --git a/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp b/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp
index 6cad12c..be26359 100644
--- a/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp
+++ b/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp
@@ -8,10 +8,7 @@
 #include <aclCommon/ArmComputeTensorUtils.hpp>
 #include <neon/NeonLayerSupport.hpp>
 #include <backendsCommon/CpuTensorHandle.hpp>
-
-#include <DataLayoutIndexed.hpp>
-
-using namespace armnnUtils;
+#include <backendsCommon/WorkloadUtils.hpp>
 
 namespace armnn
 {
@@ -19,17 +16,23 @@ namespace armnn
 using namespace armcomputetensorutils;
 
 arm_compute::Status NeonDepthwiseConvolutionWorkloadValidate(const TensorInfo& input,
-    const TensorInfo& output,
-    const DepthwiseConvolution2dDescriptor& descriptor,
-    const TensorInfo& weights,
-    const Optional<TensorInfo>& biases)
+                                                             const TensorInfo& output,
+                                                             const DepthwiseConvolution2dDescriptor& descriptor,
+                                                             const TensorInfo& weights,
+                                                             const Optional<TensorInfo>& biases)
 {
-    const arm_compute::TensorInfo aclInputInfo =
-        BuildArmComputeTensorInfo(input, descriptor.m_DataLayout);
-    const arm_compute::TensorInfo aclOutputInfo =
-        BuildArmComputeTensorInfo(output, descriptor.m_DataLayout);
-    const arm_compute::TensorInfo aclWeightsInfo =
-        BuildArmComputeTensorInfo(weights, descriptor.m_DataLayout);
+    const arm_compute::TensorInfo aclInputInfo  = BuildArmComputeTensorInfo(input,  descriptor.m_DataLayout);
+    const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output, descriptor.m_DataLayout);
+
+    // ArmNN's weight format is [ M, I, H, W ]
+    const unsigned int aclDepthMultiplier = weights.GetShape()[0];
+
+    // Convert the weight format from ArmNN's [ M, I, H, W ] (does NOT depend on the data layout) to either
+    // [ 1, H, W, I * M ] (if NHWC) or [ 1, I * M, H, W ] (if NCHW), as required by the compute library
+    TensorInfo weightsPermuted = ConvertWeightTensorInfoFromArmnnToAcl(weights, descriptor.m_DataLayout);
+
+    // Convert the weights into the compute library format
+    const arm_compute::TensorInfo aclWeightsInfo = BuildArmComputeTensorInfo(weightsPermuted, descriptor.m_DataLayout);
 
     arm_compute::TensorInfo aclBiasesInfo;
     arm_compute::TensorInfo *optionalAclBiasesInfo = nullptr;
@@ -42,9 +45,7 @@ arm_compute::Status NeonDepthwiseConvolutionWorkloadValidate(const TensorInfo& i
         optionalAclBiasesInfo = &aclBiasesInfo;
     }
 
-    const arm_compute::PadStrideInfo aclPadStrideInfo =
-        BuildArmComputePadStrideInfo(descriptor);
-    const unsigned int aclDepthMultiplier = weights.GetShape()[0];
+    const arm_compute::PadStrideInfo aclPadStrideInfo = BuildArmComputePadStrideInfo(descriptor);
 
     return arm_compute::NEDepthwiseConvolutionLayer::validate(&aclInputInfo,
                                                               &aclWeightsInfo,
@@ -59,14 +60,21 @@ NeonDepthwiseConvolutionWorkload::NeonDepthwiseConvolutionWorkload(
     const WorkloadInfo& info)
     : BaseWorkload<DepthwiseConvolution2dQueueDescriptor>(descriptor, info)
 {
-    const TensorInfo& weightInfo = m_Data.m_Weight->GetTensorInfo();
+    // ArmNN's weight format is [ M, I, H, W ]
+    auto& weightInfo = m_Data.m_Weight->GetTensorInfo();
 
-    m_KernelTensor = std::make_unique<arm_compute::Tensor>();
-    BuildArmComputeTensor(*m_KernelTensor, weightInfo, m_Data.m_Parameters.m_DataLayout);
+    // Allocate a buffer for the swizzling of the weight tensor
+    std::unique_ptr<unsigned char[]> permuteBuffer(new unsigned char[m_Data.m_Weight->GetTensorInfo().GetNumBytes()]);
 
-    INeonTensorHandle* inputTensorHandle = static_cast<INeonTensorHandle*>(m_Data.m_Inputs[0]);
-    INeonTensorHandle* outputTensorHandle =  static_cast<INeonTensorHandle*>(m_Data.m_Outputs[0]);
-    DataLayoutIndexed dataLayoutIndex(m_Data.m_Parameters.m_DataLayout);
+    // Convert the weight format from ArmNN's [ M, I, H, W ] (does NOT depend on the data layout) to either
+    // [ 1, H, W, I * M ] (if NHWC) or [ 1, I * M, H, W ] (if NCHW), as required by the compute library
+    ConstTensor weightPermuted = ConvertWeightTensorFromArmnnToAcl(m_Data.m_Weight,
+                                                                   m_Data.m_Parameters.m_DataLayout,
+                                                                   permuteBuffer.get());
+
+    // Create the compute library kernel tensor from the permuted weight info (its data is copied in further below)
+    m_KernelTensor = std::make_unique<arm_compute::Tensor>();
+    BuildArmComputeTensor(*m_KernelTensor, weightPermuted.GetInfo(), m_Data.m_Parameters.m_DataLayout);
 
     if (m_Data.m_Parameters.m_BiasEnabled)
     {
@@ -84,6 +92,9 @@ NeonDepthwiseConvolutionWorkload::NeonDepthwiseConvolutionWorkload(
 
     m_Data.ValidateInputsOutputs("NeonDepthwiseConvolutionWorkload", 1, 1);
 
+    INeonTensorHandle* inputTensorHandle  = static_cast<INeonTensorHandle*>(m_Data.m_Inputs[0]);
+    INeonTensorHandle* outputTensorHandle = static_cast<INeonTensorHandle*>(m_Data.m_Outputs[0]);
+
     arm_compute::ITensor& input  = inputTensorHandle->GetTensor();
     arm_compute::ITensor& output = outputTensorHandle->GetTensor();
 
@@ -91,9 +102,11 @@ NeonDepthwiseConvolutionWorkload::NeonDepthwiseConvolutionWorkload(
     input.info()->set_data_layout(aclDataLayout);
     output.info()->set_data_layout(aclDataLayout);
 
-    bool use3x3Optimisation = weightInfo.GetShape()[dataLayoutIndex.GetWidthIndex()] == 3 &&
-                              weightInfo.GetShape()[dataLayoutIndex.GetHeightIndex()] == 3;
+    // Get the depth multiplier
+    const unsigned int depthMultiplier = weightInfo.GetShape()[0];
 
+    // Check for optimisation opportunities.
+    bool use3x3Optimisation = (weightInfo.GetShape()[2] == 3) && (weightInfo.GetShape()[3] == 3);
     if (use3x3Optimisation)
     {
         m_pDepthwiseConvolutionLayer = std::make_unique<arm_compute::NEDepthwiseConvolutionLayer3x3>();
@@ -102,7 +115,8 @@ NeonDepthwiseConvolutionWorkload::NeonDepthwiseConvolutionWorkload(
                                                            m_KernelTensor.get(),
                                                            m_BiasTensor.get(),
                                                            &output,
-                                                           padStrideInfo);
+                                                           padStrideInfo,
+                                                           depthMultiplier);
     }
     else
     {
@@ -112,12 +126,14 @@ NeonDepthwiseConvolutionWorkload::NeonDepthwiseConvolutionWorkload(
                                                            m_KernelTensor.get(),
                                                            m_BiasTensor.get(),
                                                            &output,
-                                                           padStrideInfo);
+                                                           padStrideInfo,
+                                                           depthMultiplier);
     }
 
     BOOST_ASSERT(m_pDepthwiseConvolutionLayer);
 
-    InitializeArmComputeTensorData(*m_KernelTensor, m_Data.m_Weight);
+    ScopedCpuTensorHandle weightsPermutedHandle(weightPermuted);
+    InitializeArmComputeTensorData(*m_KernelTensor, &weightsPermutedHandle);
 
     if (m_Data.m_Parameters.m_BiasEnabled)
     {
diff --git a/src/backends/reference/workloads/ConvImpl.hpp b/src/backends/reference/workloads/ConvImpl.hpp
index 704bc36..5c07f57 100644
--- a/src/backends/reference/workloads/ConvImpl.hpp
+++ b/src/backends/reference/workloads/ConvImpl.hpp
@@ -57,7 +57,6 @@ static void ConvImpl(ConvData data,
                      float filterScale,
                      int32_t filterOffset,
                      const BiasType* biasData,
-                     InputType* outputData,
                      float outputScale,
                      int32_t outputOffset,
                      const TensorInfo& filterInfo,
@@ -68,10 +67,10 @@ static void ConvImpl(ConvData data,
         throw InvalidArgumentException("Bias is enabled but the bias data is invalid");
     }
 
-    const TensorInfo& inputInfo0  = GetTensorInfo(data.m_Inputs[0]);
-    const TensorInfo& outputInfo0 = GetTensorInfo(data.m_Outputs[0]);
+    const TensorInfo& inputInfo  = GetTensorInfo(data.m_Inputs[0]);
+    const TensorInfo& outputInfo = GetTensorInfo(data.m_Outputs[0]);
 
-    TensorBufferArrayView<InputType> output(outputInfo0.GetShape(),
+    TensorBufferArrayView<InputType> output(outputInfo.GetShape(),
                                             GetOutputTensorData<InputType>(0, data),
                                             data.m_Parameters.m_DataLayout);
 
@@ -81,18 +80,18 @@ static void ConvImpl(ConvData data,
     const unsigned int heightIndex   = dataLayoutIndexed.GetHeightIndex();
     const unsigned int widthIndex    = dataLayoutIndexed.GetWidthIndex();
 
-    unsigned int depthMult      = depthwise ? filterInfo.GetShape()[0] : 1;
-    unsigned int channelsInput  = filterInfo.GetShape()[channelsIndex];
-    unsigned int channelsOutput = depthwise ? channelsInput * depthMult : filterInfo.GetShape()[0];
+    unsigned int depthMultiplier = depthwise ? filterInfo.GetShape()[0] : 1;
+    unsigned int inputChannels   = depthwise ? filterInfo.GetShape()[1] : filterInfo.GetShape()[channelsIndex];
+    unsigned int outputChannels  = depthwise ? inputChannels * depthMultiplier : filterInfo.GetShape()[0];
 
-    unsigned int batchSize    = outputInfo0.GetShape()[0];
-    unsigned int heightOutput = outputInfo0.GetShape()[heightIndex];
-    unsigned int widthOutput  = outputInfo0.GetShape()[widthIndex];
-    unsigned int heightInput  = inputInfo0.GetShape()[heightIndex];
-    unsigned int widthInput   = inputInfo0.GetShape()[widthIndex];
+    unsigned int batchSize    = outputInfo.GetShape()[0];
+    unsigned int outputHeight = outputInfo.GetShape()[heightIndex];
+    unsigned int outputWidth  = outputInfo.GetShape()[widthIndex];
+    unsigned int inputHeight  = inputInfo.GetShape()[heightIndex];
+    unsigned int inputWidth   = inputInfo.GetShape()[widthIndex];
 
-    unsigned int heightFilter = filterInfo.GetShape()[heightIndex];
-    unsigned int widthFilter  = filterInfo.GetShape()[widthIndex];
+    unsigned int filterHeight = depthwise ? filterInfo.GetShape()[2] : filterInfo.GetShape()[heightIndex];
+    unsigned int filterWidth  = depthwise ? filterInfo.GetShape()[3] : filterInfo.GetShape()[widthIndex];
 
     unsigned int paddingTop  = data.m_Parameters.m_PadTop;
     unsigned int paddingLeft = data.m_Parameters.m_PadLeft;
@@ -102,68 +101,56 @@ static void ConvImpl(ConvData data,
     // The world's least efficient convolution.
     for (unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++)
     {
-        for (unsigned int cOutput = 0; cOutput < channelsOutput; cOutput++)
+        for (unsigned int cOutput = 0; cOutput < outputChannels; cOutput++)
         {
-            for (unsigned int yOutput = 0; yOutput < heightOutput; yOutput++)
+            for (unsigned int yOutput = 0; yOutput < outputHeight; yOutput++)
             {
-                for (unsigned int xOutput = 0; xOutput < widthOutput; xOutput++)
+                for (unsigned int xOutput = 0; xOutput < outputWidth; xOutput++)
                 {
                     // This loop goes over each output element.
                     AccumulatorType sum = AccumulatorType();
 
                     // For depthwise, each output channel corresponds to exactly one input channel.
                     // For normal, must loop over each input channel.
-                    for (unsigned int cInput = 0; cInput < (depthwise ? 1 : channelsInput); cInput++)
+                    for (unsigned int cInput = 0; cInput < (depthwise ? 1 : inputChannels); cInput++)
                     {
                         unsigned int depthwiseMultiplierIdx = 0;
                         if (depthwise)
                         {
-                            cInput = cOutput / depthMult;
-                            depthwiseMultiplierIdx = cOutput % depthMult;
+                            cInput = cOutput / depthMultiplier;
+                            depthwiseMultiplierIdx = cOutput % depthMultiplier;
                         }
 
-                        for (unsigned int yFilter = 0; yFilter < heightFilter; yFilter++)
+                        for (unsigned int yFilter = 0; yFilter < filterHeight; yFilter++)
                         {
-                            for (unsigned int xFilter = 0; xFilter < widthFilter; xFilter++)
+                            for (unsigned int xFilter = 0; xFilter < filterWidth; xFilter++)
                             {
                                 // This loop goes over each input element for each output element.
 
-                                unsigned int filterIndex;
+                                unsigned int filterIndex = 0;
 
                                 // Since dimensionality of kernel depends on depthwiseness, so does index.
                                 if (depthwise)
                                 {
-                                    if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
-                                    {
-                                        filterIndex = depthwiseMultiplierIdx * heightFilter * widthFilter
-                                                        * channelsInput +
-                                                      yFilter * widthFilter * channelsInput +
-                                                      xFilter * channelsInput +
-                                                      cInput;
-                                    }
-                                    else
-                                    {
-                                        filterIndex = depthwiseMultiplierIdx * widthFilter * heightFilter
-                                                        * channelsInput +
-                                                      cInput * widthFilter * heightFilter +
-                                                      yFilter * widthFilter +
-                                                      xFilter;
-                                    }
+                                    filterIndex = depthwiseMultiplierIdx * filterWidth * filterHeight * inputChannels +
+                                                  cInput * filterWidth * filterHeight +
+                                                  yFilter * filterWidth +
+                                                  xFilter;
                                 }
                                 else
                                 {
                                     if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
                                     {
-                                        filterIndex = cOutput * heightFilter * widthFilter * channelsInput +
-                                                      yFilter * widthFilter * channelsInput +
-                                                      xFilter * channelsInput +
+                                        filterIndex = cOutput * filterHeight * filterWidth * inputChannels +
+                                                      yFilter * filterWidth * inputChannels +
+                                                      xFilter * inputChannels +
                                                       cInput;
                                     }
                                     else
                                     {
-                                        filterIndex = cOutput * widthFilter * heightFilter * channelsInput +
-                                                      cInput  * widthFilter * heightFilter +
-                                                      yFilter * widthFilter +
+                                        filterIndex = cOutput * filterWidth * filterHeight * inputChannels +
+                                                      cInput  * filterWidth * filterHeight +
+                                                      yFilter * filterWidth +
                                                       xFilter;
                                     }
                                 }
@@ -177,8 +164,8 @@ static void ConvImpl(ConvData data,
                                 AccumulatorType inputValue;
 
                                 // Check if we're in the padding.
-                                if (yInput < paddingTop || yInput >= heightInput + paddingTop ||
-                                    xInput < paddingLeft || xInput >= widthInput + paddingLeft )
+                                if (yInput < paddingTop || yInput >= inputHeight + paddingTop ||
+                                    xInput < paddingLeft || xInput >= inputWidth + paddingLeft )
                                 {
                                     inputValue = AccumulatorType();
                                 }
@@ -188,17 +175,17 @@ static void ConvImpl(ConvData data,
 
                                     if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
                                     {
-                                        inputIndex = batchIdx * heightInput * widthInput  * channelsInput +
-                                                     (yInput - paddingTop) * widthInput * channelsInput +
-                                                     (xInput - paddingLeft) * channelsInput +
+                                        inputIndex = batchIdx * inputHeight * inputWidth  * inputChannels +
+                                                     (yInput - paddingTop) * inputWidth * inputChannels +
+                                                     (xInput - paddingLeft) * inputChannels +
                                                      cInput;
 
                                     }
                                     else
                                     {
-                                        inputIndex = batchIdx * widthInput * heightInput * channelsInput +
-                                                     widthInput * heightInput * cInput +
-                                                     widthInput * (yInput - paddingTop) +
+                                        inputIndex = batchIdx * inputWidth * inputHeight * inputChannels +
+                                                     inputWidth * inputHeight * cInput +
+                                                     inputWidth * (yInput - paddingTop) +
                                                      xInput - paddingLeft;
                                     }
 
diff --git a/src/backends/reference/workloads/RefConvolution2dFloat32Workload.cpp b/src/backends/reference/workloads/RefConvolution2dFloat32Workload.cpp
index 2090564..7b298df 100644
--- a/src/backends/reference/workloads/RefConvolution2dFloat32Workload.cpp
+++ b/src/backends/reference/workloads/RefConvolution2dFloat32Workload.cpp
@@ -23,15 +23,13 @@ void RefConvolution2dFloat32Workload::Execute() const
 {
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefConvolution2dFloat32Workload_Execute");
 
-    float*       outputData = GetOutputTensorDataFloat(0, m_Data);
     const float* inputData  = GetInputTensorDataFloat(0, m_Data);
-    const float* weightData = m_Weight->template GetConstTensor<float>();
-    const float* biasData   = m_Data.m_Parameters.m_BiasEnabled ?
-        m_Bias->template GetConstTensor<float>() : nullptr;
+    const float* filterData = m_Weight->template GetConstTensor<float>();
+    const float* biasData   = m_Data.m_Parameters.m_BiasEnabled ? m_Bias->template GetConstTensor<float>() : nullptr;
     const TensorInfo& filterInfo = m_Weight->GetTensorInfo();
 
     ConvImpl<armnn::Convolution2dQueueDescriptor, float, float, float>(
-        m_Data, inputData, 0.0f, 0, weightData, 0.0f, 0, biasData, outputData, 0.0f, 0, filterInfo);
+        m_Data, inputData, 0.0f, 0, filterData, 0.0f, 0, biasData, 0.0f, 0, filterInfo);
 }
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefConvolution2dUint8Workload.cpp b/src/backends/reference/workloads/RefConvolution2dUint8Workload.cpp
index 881e9bf..af2c7ad 100644
--- a/src/backends/reference/workloads/RefConvolution2dUint8Workload.cpp
+++ b/src/backends/reference/workloads/RefConvolution2dUint8Workload.cpp
@@ -27,10 +27,7 @@ void RefConvolution2dUint8Workload::Execute() const
     const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
     const uint8_t* weightsData = m_Weight->template GetConstTensor<uint8_t>();
     const TensorInfo& weightsInfo = GetTensorInfo(m_Weight.get());
-    const int32_t* biasData = m_Data.m_Parameters.m_BiasEnabled ?
-        m_Bias->template GetConstTensor<int32_t>() :
-        nullptr;
-    uint8_t* outputData = GetOutputTensorDataU8(0, m_Data);
+    const int32_t* biasData = m_Data.m_Parameters.m_BiasEnabled ? m_Bias->template GetConstTensor<int32_t>() : nullptr;
     const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
     const TensorInfo& filterInfo = m_Weight->GetTensorInfo();
 
@@ -39,7 +36,7 @@ void RefConvolution2dUint8Workload::Execute() const
         inputData, inputInfo.GetQuantizationScale(),  inputInfo.GetQuantizationOffset(),
         weightsData, weightsInfo.GetQuantizationScale(), weightsInfo.GetQuantizationOffset(),
         biasData,
-        outputData, outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(), filterInfo);
+        outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(), filterInfo);
 }
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefDepthwiseConvolution2dFloat32Workload.cpp b/src/backends/reference/workloads/RefDepthwiseConvolution2dFloat32Workload.cpp
index e89013b..756e958 100644
--- a/src/backends/reference/workloads/RefDepthwiseConvolution2dFloat32Workload.cpp
+++ b/src/backends/reference/workloads/RefDepthwiseConvolution2dFloat32Workload.cpp
@@ -23,15 +23,13 @@ void RefDepthwiseConvolution2dFloat32Workload::Execute() const
 {
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefDepthwiseConvolution2dFloat32Workload_Execute");
 
-    float*       outputData = GetOutputTensorDataFloat(0, m_Data);
     const float* inputData  = GetInputTensorDataFloat(0, m_Data);
     const float* weightData = m_Weight->template GetConstTensor<float>();
-    const float* biasData   = m_Data.m_Parameters.m_BiasEnabled ?
-        m_Bias->template GetConstTensor<float>() : nullptr;
+    const float* biasData   = m_Data.m_Parameters.m_BiasEnabled ? m_Bias->template GetConstTensor<float>() : nullptr;
     const TensorInfo& filterInfo = m_Weight->GetTensorInfo();
 
     ConvImpl<armnn::DepthwiseConvolution2dQueueDescriptor, float, float, float>
-        (m_Data, inputData, 0.0f, 0, weightData, 0.0f, 0, biasData, outputData, 0.0f, 0, filterInfo, true);
+        (m_Data, inputData, 0.0f, 0, weightData, 0.0f, 0, biasData, 0.0f, 0, filterInfo, true);
 }
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefDepthwiseConvolution2dUint8Workload.cpp b/src/backends/reference/workloads/RefDepthwiseConvolution2dUint8Workload.cpp
index e8e501d..629b729 100644
--- a/src/backends/reference/workloads/RefDepthwiseConvolution2dUint8Workload.cpp
+++ b/src/backends/reference/workloads/RefDepthwiseConvolution2dUint8Workload.cpp
@@ -28,10 +28,7 @@ void RefDepthwiseConvolution2dUint8Workload::Execute() const
     const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
     const uint8_t* weightsData = m_Weight->template GetConstTensor<uint8_t>();
     const TensorInfo& weightsInfo = GetTensorInfo(m_Weight.get());
-    const int32_t* biasData = m_Data.m_Parameters.m_BiasEnabled ?
-        m_Bias->template GetConstTensor<int32_t>() :
-        nullptr;
-    uint8_t* outputData = GetOutputTensorDataU8(0, m_Data);
+    const int32_t* biasData = m_Data.m_Parameters.m_BiasEnabled ? m_Bias->template GetConstTensor<int32_t>() : nullptr;
     const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
     const TensorInfo& filterInfo = m_Weight->GetTensorInfo();
 
@@ -40,7 +37,7 @@ void RefDepthwiseConvolution2dUint8Workload::Execute() const
         inputData, inputInfo.GetQuantizationScale(),  inputInfo.GetQuantizationOffset(),
         weightsData, weightsInfo.GetQuantizationScale(), weightsInfo.GetQuantizationOffset(),
         biasData,
-        outputData, outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(), filterInfo, true);
+        outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(), filterInfo, true);
 }
 
 } //namespace armnn
-- 
2.7.4