return noErrors;
}
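+// Descriptive comment (not in the upstream change): if the given layer is a Convolution2d or
+// FullyConnected layer holding BFloat16 weights, convert those weights to Float32 and replace
+// the weight handle so the layer can execute in Float32.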
+template <typename LayerT>
+LayerT* ConvertBf16ToFp32Weight(Layer* l)
+{
+ LayerT* layer = boost::polymorphic_downcast<LayerT*>(l);
+ if ((layer->GetType() == LayerType::Convolution2d || layer->GetType() == LayerType::FullyConnected)
+ && layer->m_Weight)
+ {
+ const TensorInfo& info = layer->m_Weight->GetTensorInfo();
+
+ if (info.GetDataType() == DataType::BFloat16)
+ {
+ std::vector<float> newValues(info.GetNumElements());
+
+ armnnUtils::FloatingPointConverter::ConvertBFloat16ToFloat32(
+ layer->m_Weight->template GetTensor<armnn::BFloat16>(), info.GetNumElements(), newValues.data());
+
+ TensorInfo newInfo(info.GetShape(), DataType::Float32);
+ ConstTensor newInput(newInfo, newValues);
+ layer->m_Weight.reset(new ScopedCpuTensorHandle(newInput));
+ }
+ }
+ return layer;
+}
+
OptimizationResult AttemptBackendAssignment(BackendSettings& backendSettings,
Graph& graph,
Layer* layer,
{
convertBf16ToFp32Layers =
InsertConvertBf16ToFp32LayersBefore(graph, *layer);
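+ // The layer will now run in Float32, so convert any BFloat16 weights it holds back to Float32.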
+ if (layer->GetType() == LayerType::Convolution2d)
+ {
+ ConvertBf16ToFp32Weight<Convolution2dLayer>(layer);
+ }
+ else if (layer->GetType() == LayerType::FullyConnected)
+ {
+ ConvertBf16ToFp32Weight<FullyConnectedLayer>(layer);
+ }
}
// Insert FP32 -> BF16 conversion layer after current layer
{
switch(dataType)
{
+ case armnn::DataType::BFloat16:
+ return arm_compute::DataType::BFLOAT16;
case armnn::DataType::Boolean:
return arm_compute::DataType::U8;
case armnn::DataType::Float16:
&TrueFunc<>);
}
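+// The Bf16 <-> Fp32 conversions are performed by CPU copy workloads (NeonConvertBf16ToFp32Workload
+// and NeonConvertFp32ToBf16Workload) rather than ACL kernels, so they are reported as supported
+// for all inputs.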
+bool NeonLayerSupport::IsConvertBf16ToFp32Supported(const TensorInfo& input,
+ const TensorInfo& output,
+ Optional<std::string&> reasonIfUnsupported) const
+{
+ armnn::IgnoreUnused(input);
+ armnn::IgnoreUnused(output);
+ armnn::IgnoreUnused(reasonIfUnsupported);
+ return true;
+}
+
bool NeonLayerSupport::IsConvertFp16ToFp32Supported(const TensorInfo& input,
const TensorInfo& output,
Optional<std::string&> reasonIfUnsupported) const
return true;
}
+bool NeonLayerSupport::IsConvertFp32ToBf16Supported(const TensorInfo& input,
+ const TensorInfo& output,
+ Optional<std::string&> reasonIfUnsupported) const
+{
+ armnn::IgnoreUnused(input);
+ armnn::IgnoreUnused(output);
+ armnn::IgnoreUnused(reasonIfUnsupported);
+ return true;
+}
+
bool NeonLayerSupport::IsConvertFp32ToFp16Supported(const TensorInfo& input,
const TensorInfo& output,
Optional<std::string&> reasonIfUnsupported) const
bool IsConstantSupported(const TensorInfo& output,
Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+ bool IsConvertBf16ToFp32Supported(const TensorInfo& input,
+ const TensorInfo& output,
+ Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
bool IsConvertFp16ToFp32Supported(const TensorInfo& input,
const TensorInfo& output,
Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+ bool IsConvertFp32ToBf16Supported(const TensorInfo& input,
+ const TensorInfo& output,
+ Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
bool IsConvertFp32ToFp16Supported(const TensorInfo& input,
const TensorInfo& output,
Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
//
#pragma once
+#include <BFloat16.hpp>
#include <Half.hpp>
#include <aclCommon/ArmComputeTensorHandle.hpp>
armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
static_cast<uint8_t*>(memory));
break;
+ case arm_compute::DataType::BFLOAT16:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<armnn::BFloat16*>(memory));
+ break;
case arm_compute::DataType::F16:
armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
static_cast<armnn::Half*>(memory));
armcomputetensorutils::CopyArmComputeITensorData(static_cast<const uint8_t*>(memory),
this->GetTensor());
break;
+ case arm_compute::DataType::BFLOAT16:
+ armcomputetensorutils::CopyArmComputeITensorData(static_cast<const armnn::BFloat16*>(memory),
+ this->GetTensor());
+ break;
case arm_compute::DataType::F16:
armcomputetensorutils::CopyArmComputeITensorData(static_cast<const armnn::Half*>(memory),
this->GetTensor());
return std::make_unique<NeonConstantWorkload>(descriptor, info);
}
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConvertBf16ToFp32(
+ const ConvertBf16ToFp32QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return std::make_unique<NeonConvertBf16ToFp32Workload>(descriptor, info);
+}
+
std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConvertFp16ToFp32(
const ConvertFp16ToFp32QueueDescriptor& descriptor,
const WorkloadInfo& info) const
return std::make_unique<NeonConvertFp16ToFp32Workload>(descriptor, info);
}
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConvertFp32ToBf16(
+ const ConvertFp32ToBf16QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return std::make_unique<NeonConvertFp32ToBf16Workload>(descriptor, info);
+}
+
std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConvertFp32ToFp16(
const ConvertFp32ToFp16QueueDescriptor& descriptor,
const WorkloadInfo& info) const
std::unique_ptr<IWorkload> CreateConstant(const ConstantQueueDescriptor& descriptor,
const WorkloadInfo& info) const override;
+ std::unique_ptr<IWorkload> CreateConvertBf16ToFp32(const ConvertBf16ToFp32QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
std::unique_ptr<IWorkload> CreateConvertFp16ToFp32(const ConvertFp16ToFp32QueueDescriptor& descriptor,
const WorkloadInfo& info) const override;
+ std::unique_ptr<IWorkload> CreateConvertFp32ToBf16(const ConvertFp32ToBf16QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
std::unique_ptr<IWorkload> CreateConvertFp32ToFp16(const ConvertFp32ToFp16QueueDescriptor& descriptor,
const WorkloadInfo& info) const override;
workloads/NeonComparisonWorkload.cpp \
workloads/NeonConcatWorkload.cpp \
workloads/NeonConstantWorkload.cpp \
+ workloads/NeonConvertBf16ToFp32Workload.cpp \
workloads/NeonConvertFp16ToFp32Workload.cpp \
+ workloads/NeonConvertFp32ToBf16Workload.cpp \
workloads/NeonConvertFp32ToFp16Workload.cpp \
workloads/NeonConvolution2dWorkload.cpp \
ARMNN_AUTO_TEST_CASE(ConcatUint8DifferentInputOutputQParam,
ConcatDifferentInputOutputQParamTest<DataType::QAsymmU8>, false)
+// Convert from BFloat16 to Float32
+ARMNN_AUTO_TEST_CASE(ConvertBf16ToFp32, ConvertBf16ToFp32Test)
+
+// Convert from Float32 to BFloat16
+ARMNN_AUTO_TEST_CASE(ConvertFp32ToBf16, ConvertFp32ToBf16Test)
+
// Fully Connected
ARMNN_AUTO_TEST_CASE(SimpleFullyConnected, FullyConnectedFloat32Test, false, false)
ARMNN_AUTO_TEST_CASE(SimpleFullyConnectedWithBias, FullyConnectedFloat32Test, true, false)
NeonConcatWorkload.hpp
NeonConstantWorkload.cpp
NeonConstantWorkload.hpp
+ NeonConvertBf16ToFp32Workload.cpp
+ NeonConvertBf16ToFp32Workload.hpp
NeonConvertFp16ToFp32Workload.cpp
NeonConvertFp16ToFp32Workload.hpp
+ NeonConvertFp32ToBf16Workload.cpp
+ NeonConvertFp32ToBf16Workload.hpp
NeonConvertFp32ToFp16Workload.cpp
NeonConvertFp32ToFp16Workload.hpp
NeonConvolution2dWorkload.cpp
#include "NeonConstantWorkload.hpp"
#include <arm_compute/core/Types.h>
+#include <BFloat16.hpp>
#include <Half.hpp>
#include <aclCommon/ArmComputeTensorUtils.hpp>
#include <neon/NeonTensorHandle.hpp>
switch (computeDataType)
{
+ case arm_compute::DataType::BFLOAT16:
+ {
+ CopyArmComputeITensorData(data.m_LayerOutput->GetConstTensor<BFloat16>(), output);
+ break;
+ }
case arm_compute::DataType::F16:
{
CopyArmComputeITensorData(data.m_LayerOutput->GetConstTensor<Half>(), output);
--- /dev/null
+++ b/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.cpp
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonConvertBf16ToFp32Workload.hpp"
+
+#include <armnnUtils/FloatingPointConverter.hpp>
+
+#include <BFloat16.hpp>
+
+#include <backendsCommon/WorkloadUtils.hpp>
+
+namespace armnn
+{
+
+NeonConvertBf16ToFp32Workload::NeonConvertBf16ToFp32Workload(const ConvertBf16ToFp32QueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : BFloat16ToFloat32Workload<ConvertBf16ToFp32QueueDescriptor>(descriptor, info)
+{
+ this->m_Data.ValidateInputsOutputs("NeonConvertBf16ToFp32Workload", 1, 1);
+ GatherTensorHandlePairs(descriptor, m_TensorHandlePairs);
+}
+
+void NeonConvertBf16ToFp32Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConvertBf16ToFp32Workload_Execute");
+
+ auto convertFunc = [](uint8_t* dst, const uint8_t* src, size_t size)
+ {
+ auto input = reinterpret_cast<const BFloat16*>(src);
+ auto output = reinterpret_cast<float*>(dst);
+ size_t numElements = size/2; // 2 bytes per Bf16
+ armnnUtils::FloatingPointConverter::ConvertBFloat16ToFloat32(input, numElements, output);
+ };
+
+ for (const auto& pair : m_TensorHandlePairs)
+ {
+ CopyTensorContentsGeneric(pair.first, pair.second, convertFunc);
+ }
+}
+
+} //namespace armnn
--- /dev/null
+++ b/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.hpp
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backendsCommon/Workload.hpp>
+#include <backendsCommon/WorkloadData.hpp>
+#include <neon/workloads/NeonWorkloadUtils.hpp>
+
+namespace armnn
+{
+
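+// Converts each BFloat16 input tensor to a Float32 output tensor using a CPU-side copy.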
+class NeonConvertBf16ToFp32Workload : public BFloat16ToFloat32Workload<ConvertBf16ToFp32QueueDescriptor>
+{
+public:
+ NeonConvertBf16ToFp32Workload(const ConvertBf16ToFp32QueueDescriptor& descriptor, const WorkloadInfo& info);
+ virtual void Execute() const override;
+
+private:
+ using TensorHandlePair = std::pair<const ITensorHandle*, ITensorHandle*>;
+ std::vector<TensorHandlePair> m_TensorHandlePairs;
+};
+
+} //namespace armnn
--- /dev/null
+++ b/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.cpp
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonConvertFp32ToBf16Workload.hpp"
+
+#include <BFloat16.hpp>
+#include <Profiling.hpp>
+
+#include <armnnUtils/FloatingPointConverter.hpp>
+
+#include <backendsCommon/WorkloadUtils.hpp>
+
+namespace armnn
+{
+
+NeonConvertFp32ToBf16Workload::NeonConvertFp32ToBf16Workload(const ConvertFp32ToBf16QueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : Float32ToBFloat16Workload<ConvertFp32ToBf16QueueDescriptor>(descriptor, info)
+{
+ this->m_Data.ValidateInputsOutputs("NeonConvertFp32ToBf16Workload", 1, 1);
+ GatherTensorHandlePairs(descriptor, m_TensorHandlePairs);
+}
+
+void NeonConvertFp32ToBf16Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConvertFp32ToBf16Workload_Execute");
+
+ auto convertFunc = [](uint8_t* dst, const uint8_t* src, size_t size)
+ {
+ auto input = reinterpret_cast<const float*>(src);
+ auto output = reinterpret_cast<BFloat16*>(dst);
+ size_t numElements = size/2; // 2 bytes per bf16
+ armnnUtils::FloatingPointConverter::ConvertFloat32ToBFloat16(input, numElements, output);
+ };
+
+ for (const auto& pair : m_TensorHandlePairs)
+ {
+ CopyTensorContentsGeneric(pair.first, pair.second, convertFunc);
+ }
+}
+
+} //namespace armnn
--- /dev/null
+++ b/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.hpp
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backendsCommon/Workload.hpp>
+#include <backendsCommon/WorkloadData.hpp>
+#include <neon/workloads/NeonWorkloadUtils.hpp>
+
+namespace armnn
+{
+
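+// Converts each Float32 input tensor to a BFloat16 output tensor using a CPU-side copy.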
+class NeonConvertFp32ToBf16Workload : public Float32ToBFloat16Workload<ConvertFp32ToBf16QueueDescriptor>
+{
+public:
+ NeonConvertFp32ToBf16Workload(const ConvertFp32ToBf16QueueDescriptor& descriptor, const WorkloadInfo& info);
+ virtual void Execute() const override;
+
+private:
+ using TensorHandlePair = std::pair<const ITensorHandle*, ITensorHandle*>;
+ std::vector<TensorHandlePair> m_TensorHandlePairs;
+};
+
+} //namespace armnn
#include "NeonBatchToSpaceNdWorkload.hpp"
#include "NeonComparisonWorkload.hpp"
#include "NeonConstantWorkload.hpp"
+#include "NeonConvertBf16ToFp32Workload.hpp"
#include "NeonConvertFp16ToFp32Workload.hpp"
+#include "NeonConvertFp32ToBf16Workload.hpp"
#include "NeonConvertFp32ToFp16Workload.hpp"
#include "NeonConvolution2dWorkload.hpp"
#include "NeonDepthToSpaceWorkload.hpp"