// Basic checks of armnn::BFloat16: storage size, construction from a raw 16-bit pattern and
// from float, equality against another BFloat16 and against float, and the inf/NaN patterns.
BOOST_AUTO_TEST_CASE(BFloatType)
{
- armnn::BFloat16 a = 16256;
// 16256 == 0x3F80; the checks below expect this pattern to compare equal to BFloat16(1.0f).
+ uint16_t v = 16256;
+ armnn::BFloat16 a(v);
+ armnn::BFloat16 b(1.0f);
+ armnn::BFloat16 zero;
// The type must occupy exactly two bytes.
BOOST_CHECK_EQUAL(sizeof(a), 2);
// Raw-pattern and float construction must agree, and val() must return the stored pattern.
+ BOOST_CHECK_EQUAL(a, b);
+ BOOST_CHECK_EQUAL(a.val(), v);
// Comparison against a float literal; a default-constructed value must compare equal to 0.0f.
+ BOOST_CHECK_EQUAL(a, 1.0f);
+ BOOST_CHECK_EQUAL(zero, 0.0f);
+
+ // Infinity: constructing from float infinity must yield the same value as BFloat16::inf().
+ float infFloat = std::numeric_limits<float>::infinity();
+ armnn::BFloat16 infBF(infFloat);
+ BOOST_CHECK_EQUAL(infBF, armnn::BFloat16::inf());
+
+ // NaN: constructing from a quiet NaN must yield the same value as BFloat16::nan().
+ float nan = std::numeric_limits<float>::quiet_NaN();
+ armnn::BFloat16 nanBF(nan);
+ BOOST_CHECK_EQUAL(nanBF, armnn::BFloat16::nan());
// Test utility function returns correct type.
// NOTE(review): ResolvedType is not used in the lines visible here - confirm the check that
// consumes it has not been dropped.
using ResolvedType = armnn::ResolveType<armnn::DataType::BFloat16>;
// The data type name lookup must report "BFloat16".
BOOST_CHECK((GetDataTypeName(armnn::DataType::BFloat16) == std::string("BFloat16")));
}
+BOOST_AUTO_TEST_CASE(Float32ToBFloat16Test)
+{
+ // LSB = 0, R = 0 -> round down
+ armnn::BFloat16 roundDown0 = armnn::BFloat16::float32ToBFloat16(1.704735E38f); // 0x7F004000
+ BOOST_CHECK_EQUAL(roundDown0.val(), 0x7F00);
+ // LSB = 1, R = 0 -> round down
+ armnn::BFloat16 roundDown1 = armnn::BFloat16::float32ToBFloat16(9.18355E-41f); // 0x00010000
+ BOOST_CHECK_EQUAL(roundDown1.val(), 0x0001);
+ // LSB = 0, R = 1 all 0 -> round down
+ armnn::BFloat16 roundDown2 = armnn::BFloat16::float32ToBFloat16(1.14794E-40f); // 0x00014000
+ BOOST_CHECK_EQUAL(roundDown2.val(), 0x0001);
+ // LSB = 1, R = 1 -> round up
+ armnn::BFloat16 roundUp = armnn::BFloat16::float32ToBFloat16(-2.0234377f); // 0xC0018001
+ BOOST_CHECK_EQUAL(roundUp.val(), 0xC002);
+ // LSB = 0, R = 1 -> round up
+ armnn::BFloat16 roundUp1 = armnn::BFloat16::float32ToBFloat16(4.843037E-35f); // 0x0680C000
+ BOOST_CHECK_EQUAL(roundUp1.val(), 0x0681);
+ // Max positive value -> infinity
+ armnn::BFloat16 maxPositive = armnn::BFloat16::float32ToBFloat16(3.4028235E38f); // 0x7F7FFFFF
+ BOOST_CHECK_EQUAL(maxPositive, armnn::BFloat16::inf());
+ // Max negative value -> -infinity
+ armnn::BFloat16 maxNeg = armnn::BFloat16::float32ToBFloat16(-3.4028235E38f); // 0xFF7FFFFF
+ BOOST_CHECK_EQUAL(maxNeg.val(), 0xFF80);
+ // Min positive value
+ armnn::BFloat16 minPositive = armnn::BFloat16::float32ToBFloat16(1.1754942E-38f); // 0x007FFFFF
+ BOOST_CHECK_EQUAL(minPositive.val(), 0x0080);
+ // Min negative value
+ armnn::BFloat16 minNeg = armnn::BFloat16::float32ToBFloat16(-1.1754942E-38f); // 0x807FFFFF
+ BOOST_CHECK_EQUAL(minNeg.val(), 0x8080);
+}
+
+BOOST_AUTO_TEST_CASE(BFloat16ToFloat32Test)
+{
+ armnn::BFloat16 bf0(1.5f);
+ BOOST_CHECK_EQUAL(bf0.toFloat32(), 1.5f);
+ armnn::BFloat16 bf1(-5.525308E-25f);
+ BOOST_CHECK_EQUAL(bf1.toFloat32(), -5.525308E-25f);
+ armnn::BFloat16 bf2(-2.0625f);
+ BOOST_CHECK_EQUAL(bf2.toFloat32(), -2.0625f);
+ uint16_t v = 32639;
+ armnn::BFloat16 bf3(v);
+ BOOST_CHECK_EQUAL(bf3.toFloat32(), 3.3895314E38f);
+ // Infinity
+ BOOST_CHECK_EQUAL(armnn::BFloat16::inf().toFloat32(), std::numeric_limits<float>::infinity());
+ // NaN
+ BOOST_CHECK(std::isnan(armnn::BFloat16::nan().toFloat32()));
+}
+
BOOST_AUTO_TEST_CASE(GraphTopologicalSortSimpleTest)
{
std::map<int, std::vector<int>> graph;
#pragma once
+#include <cmath>
+#include <cstring>
+#include <ostream>
+#include <math.h>
#include <stdint.h>
namespace armnn
{
- using BFloat16 = uint16_t;
// 16-bit floating point value stored as the upper 16 bits of an IEEE-754 binary32 pattern
// (1 sign bit, 8 exponent bits, 7 mantissa bits).
class BFloat16
{
public:
    // Constructs the all-zero bit pattern.
    BFloat16()
        : value(0)
    {}

    // Wraps a raw 16-bit pattern; no conversion is performed.
    explicit BFloat16(uint16_t v)
        : value(v)
    {}

    // Converts a float via float32ToBFloat16 (round to nearest even, NaN maps to nan()).
    explicit BFloat16(float v)
        : value(float32ToBFloat16(v).val())
    {}

    // Assigns the converted value of a float (same conversion as the float constructor).
    BFloat16& operator=(float v)
    {
        value = float32ToBFloat16(v).val();
        return *this;
    }

    // Bitwise equality of the stored patterns. Note this differs from float semantics:
    // identical NaN patterns compare equal, and the +0 / -0 patterns compare different.
    bool operator==(const BFloat16& r) const
    {
        return value == r.val();
    }

    // Compares the widened float value against r using ordinary float comparison.
    bool operator==(const float& r) const
    {
        return toFloat32() == r;
    }

    // Converts a float32 to BFloat16, rounding to the nearest even value.
    // Any NaN input returns nan(); finite values whose rounding carries out of the
    // mantissa and exponent become the infinity pattern of the same sign.
    static BFloat16 float32ToBFloat16(const float v)
    {
        static_assert(sizeof(float) == sizeof(uint32_t), "float is expected to be 32 bits wide");

        if (std::isnan(v))
        {
            return nan();
        }

        // Round value to the nearest even
        // Float32
        // S EEEEEEEE MMMMMMLRMMMMMMMMMMMMMMM
        // BFloat16
        // S EEEEEEEE MMMMMML
        // LSB (L): Least significant bit of BFloat16 (last bit of the Mantissa of BFloat16)
        // R: Rounding bit
        // LSB = 0, R = 0 -> round down
        // LSB = 1, R = 0 -> round down
        // LSB = 0, R = 1, all the rest = 0 -> round down
        // LSB = 1, R = 1 -> round up
        // LSB = 0, R = 1 -> round up

        // Copy the bits out with memcpy: reading a float through a uint32_t pointer
        // breaks the strict aliasing rule.
        uint32_t bits = 0;
        std::memcpy(&bits, &v, sizeof(bits));

        uint16_t upper = static_cast<uint16_t>(bits >> 16u);
        // Mark the LSB
        const uint16_t lsb = static_cast<uint16_t>(upper & 0x0001u);
        // The low 16 bits of the float32 that are about to be discarded
        const uint16_t discarded = static_cast<uint16_t>(bits & 0x0000FFFFu);

        // Above the halfway point always rounds up; exactly halfway rounds up only when
        // that makes the kept LSB even.
        if (discarded > 0x8000 || (discarded == 0x8000 && lsb == 1))
        {
            upper = static_cast<uint16_t>(upper + 1u);
        }
        return BFloat16(upper);
    }

    // Widens to float32 by placing the stored pattern in the upper 16 bits and zero-filling
    // the lower 16 bits.
    float toFloat32() const
    {
        static_assert(sizeof(float) == sizeof(uint32_t), "float is expected to be 32 bits wide");

        // Widen before shifting: shifting the int-promoted uint16_t would move a set top bit
        // into the sign bit of a signed int.
        const uint32_t bits = static_cast<uint32_t>(value) << 16u;
        float result = 0.0f;
        std::memcpy(&result, &bits, sizeof(result));
        return result;
    }

    // Returns the raw 16-bit pattern.
    uint16_t val() const
    {
        return value;
    }

    // Pattern 0x7F7F: exponent 0xFE with all mantissa bits set (largest finite value).
    static BFloat16 max()
    {
        return BFloat16(static_cast<uint16_t>(0x7F7F));
    }

    // Pattern 0x7FC0: exponent all ones with the top mantissa bit set (a NaN).
    static BFloat16 nan()
    {
        return BFloat16(static_cast<uint16_t>(0x7FC0));
    }

    // Pattern 0x7F80: exponent all ones with a zero mantissa (positive infinity).
    static BFloat16 inf()
    {
        return BFloat16(static_cast<uint16_t>(0x7F80));
    }

private:
    // Raw BFloat16 bit pattern.
    uint16_t value;
};
+
+inline std::ostream& operator<<(std::ostream& os, const BFloat16& b)
+{
+ os << b.toFloat32() << "(0x" << std::hex << b.val() << ")";
+ return os;
+}
+
} //namespace armnn