Add 32 to 16 bit float conversion w.r.t rounding mode
author Lei Zhang <antiagainst@google.com>
Mon, 20 Mar 2017 19:40:54 +0000 (15:40 -0400)
committer Alexander Galazin <Alexander.Galazin@arm.com>
Sun, 18 Jun 2017 14:30:45 +0000 (10:30 -0400)
Component: Framework

Change-Id: I9428f0dc725484b8a2f213157100326a69754cd8
(cherry picked from commit 7c0f5bea1da74f70e549e9a735c9d702c9396084)

Android.mk
framework/common/tcuApp.cpp
framework/delibs/debase/CMakeLists.txt
framework/delibs/debase/deFloat16.c
framework/delibs/debase/deFloat16.h
framework/delibs/debase/deFloat16Test.c [new file with mode: 0644]
framework/delibs/debase/deMath.c
framework/delibs/debase/deMath.h
modules/internal/ditDelibsTests.cpp

index 966807c..51a82b4 100644 (file)
@@ -83,6 +83,7 @@ LOCAL_SRC_FILES := \
        framework/common/tcuThreadUtil.cpp \
        framework/delibs/debase/deDefs.c \
        framework/delibs/debase/deFloat16.c \
+       framework/delibs/debase/deFloat16Test.c \
        framework/delibs/debase/deInt32.c \
        framework/delibs/debase/deInt32Test.c \
        framework/delibs/debase/deMath.c \
index 409a677..6227089 100644 (file)
@@ -89,7 +89,7 @@ App::App (Platform& platform, Archive& archive, TestLog& log, const CommandLine&
        print("dEQP Core %s (0x%08x) starting..\n", qpGetReleaseName(), qpGetReleaseId());
        print("  target implementation = '%s'\n", qpGetTargetName());
 
-       if (!deSetRoundingMode(DE_ROUNDINGMODE_TO_NEAREST))
+       if (!deSetRoundingMode(DE_ROUNDINGMODE_TO_NEAREST_EVEN))
                qpPrintf("WARNING: Failed to set floating-point rounding mode!\n");
 
        try
index 2f9d8d0..636d73e 100644 (file)
@@ -9,6 +9,7 @@ set(DEBASE_SRCS
        deDefs.h
        deFloat16.c
        deFloat16.h
+       deFloat16Test.c
        deInt32.c
        deInt32.h
        deInt32Test.c
index 6460f05..e8a1057 100644 (file)
@@ -99,6 +99,163 @@ deFloat16 deFloat32To16 (float val32)
        }
 }
 
+/*--------------------------------------------------------------------*//*!
+ * \brief Round the given number `val` to nearest even by discarding
+ *        the last `numBitsToDiscard` bits.
+ * \param val value to round
+ * \param numBitsToDiscard number of (least significant) bits to discard
+ * \return The rounded value with the last `numBitsToDiscard` removed
+ *//*--------------------------------------------------------------------*/
+static deUint32 roundToNearestEven (deUint32 val, const deUint32 numBitsToDiscard)
+{
+       const deUint32  lastBits        = val & ((1 << numBitsToDiscard) - 1);
+       const deUint32  headBit         = val & (1 << (numBitsToDiscard - 1));
+
+       DE_ASSERT(numBitsToDiscard > 0 && numBitsToDiscard < 32);       /* Make sure no overflow. */
+       val >>= numBitsToDiscard;
+
+       if (headBit == 0)
+       {
+               return val;
+       }
+       else if (headBit == lastBits)
+       {
+               if ((val & 0x1) == 0x1)
+               {
+                       return val + 1;
+               }
+               else
+               {
+                       return val;
+               }
+       }
+       else
+       {
+               return val + 1;
+       }
+}
+
+deFloat16 deFloat32To16Round (float val32, deRoundingMode mode)
+{
+       union
+       {
+               float           f;              /* Interpret as 32-bit float */
+               deUint32        u;              /* Interpret as 32-bit unsigned integer */
+       } x;
+       deUint32        sign;           /* sign : 0000 0000 0000 0000 X000 0000 0000 0000 */
+       deUint32        exp32;          /* exp32: biased exponent for 32-bit floats */
+       int                     exp16;          /* exp16: biased exponent for 16-bit floats */
+       deUint32        mantissa;
+
+       /* We only support these two rounding modes for now */
+       DE_ASSERT(mode == DE_ROUNDINGMODE_TO_ZERO || mode == DE_ROUNDINGMODE_TO_NEAREST_EVEN);
+
+       x.f                     = val32;
+       sign            = (x.u >> 16u) & 0x00008000u;
+       exp32           = (x.u >> 23u) & 0x000000ffu;
+       exp16           = (int) (exp32) - 127 + 15;     /* 15/127: exponent bias for 16-bit/32-bit floats */
+       mantissa        = x.u & 0x007fffffu;
+
+       /* Case: zero and denormalized floats */
+       if (exp32 == 0)
+       {
+               /* Denormalized floats are < 2^(1-127), not representable in 16-bit floats, rounding to zero. */
+               return (deFloat16) sign;
+       }
+       /* Case: Inf and NaN */
+       else if (exp32 == 0x000000ffu)
+       {
+               if (mantissa == 0u)
+               {
+                       /* Inf */
+                       return (deFloat16) (sign | 0x7c00u);
+               }
+               else
+               {
+                       /* NaN */
+                       mantissa >>= 13u;       /* 16-bit floats have a 10-bit mantissa, 13 bits fewer than 32-bit floats. */
+                       /* Make sure we don't turn NaN into Inf: | (mantissa == 0) keeps the mantissa non-zero. */
+                       return (deFloat16) (sign | 0x7c00u | mantissa | (mantissa == 0u));
+               }
+       }
+       /* The following are cases for normalized floats.
+        *
+        * * If exp16 is less than 0, we are experiencing underflow for the exponent. To encode this underflowed exponent,
+        *   we can only shift the mantissa further right.
+        *   The real exponent is exp16 - 15. A denormalized 16-bit float can represent -14 via its exponent.
+        *   Note that the most significant bit in the mantissa of a denormalized float is already -1 as for exponent.
+        *   So, we just need to right shift the mantissa -exp16 bits.
+        * * If exp16 is 0, mantissa shifting requirement is similar to the above.
+        * * If exp16 is greater than 30 (0b11110), we are experiencing overflow for the exponent of 16-bit normalized floats.
+        */
+       /* Case: normalized floats -> zero */
+       else if (exp16 < -10)
+       {
+               /* 16-bit floats have only 10 bits for mantissa. Minimal 16-bit denormalized float is (2^-10) * (2^-14). */
+               /* Expecting a number < (2^-10) * (2^-14) here, not representable, round to zero. */
+               return (deFloat16) sign;
+       }
+       /* Case: normalized floats -> zero and denormalized halfs */
+       else if (exp16 <= 0)
+       {
+               /* Add the implicit leading 1 of a normalized float to the mantissa. */
+               mantissa |= 0x00800000u;
+               /* We have a (23 + 1)-bit mantissa, but 16-bit floats only expect 10-bit mantissa.
+                * Need to discard the last 14-bits considering rounding mode.
+                * We also need to shift right -exp16 bits to encode the underflowed exponent.
+                */
+               if (mode == DE_ROUNDINGMODE_TO_ZERO)
+               {
+                       mantissa >>= (14 - exp16);
+               }
+               else
+               {
+                       /* mantissa in the above may exceed 10-bits, in which case overflow happens.
+                        * The overflowed bit is automatically carried to exponent then.
+                        */
+                       mantissa = roundToNearestEven(mantissa, 14 - exp16);
+               }
+               return (deFloat16) (sign | mantissa);
+       }
+       /* Case: normalized floats -> normalized floats */
+       else if (exp16 <= 30)
+       {
+               if (mode == DE_ROUNDINGMODE_TO_ZERO)
+               {
+                       return (deFloat16) (sign | ((deUint32)exp16 << 10u) | (mantissa >> 13u));
+               }
+               else
+               {
+                       mantissa        = roundToNearestEven(mantissa, 13);
+                       /* Handle overflow. exp16 may overflow (and become Inf) itself, but that's correct. */
+                       exp16           = (exp16 << 10u) + (mantissa & (1 << 10));
+                       mantissa        &= (1u << 10) - 1;
+                       return (deFloat16) (sign | ((deUint32) exp16) | mantissa);
+               }
+       }
+       /* Case: normalized floats (too large to be representable as 16-bit floats) */
+       else
+       {
+               /* According to IEEE Std 754-2008 Section 7.4,
+                * * roundTiesToEven and roundTiesToAway carry all overflows to Inf with the sign
+                *   of the intermediate  result.
+                * * roundTowardZero carries all overflows to the format’s largest finite number
+                *   with the sign of the intermediate result.
+                */
+               if (mode == DE_ROUNDINGMODE_TO_ZERO)
+               {
+                       return (deFloat16) (sign | 0x7bffu); /* 111 1011 1111 1111 */
+               }
+               else
+               {
+                       return (deFloat16) (sign | (0x1f << 10));
+               }
+       }
+
+       /* Make compiler happy */
+       return (deFloat16) 0;
+}
+
 float deFloat16To32 (deFloat16 val16)
 {
        deUint32 sign;
index d2d71dc..ab81199 100644 (file)
@@ -24,6 +24,7 @@
  *//*--------------------------------------------------------------------*/
 
 #include "deDefs.h"
+#include "deMath.h"
 
 DE_BEGIN_EXTERN_C
 
@@ -38,7 +39,9 @@ typedef               deFloat16                       DEfloat16;
  * \param val32        Input value.
  * \return Converted 16-bit floating-point value.
  *//*--------------------------------------------------------------------*/
-deFloat16      deFloat32To16           (float val32);
+deFloat16      deFloat32To16                           (float val32);
+deFloat16      deFloat32To16Round                      (float val32, deRoundingMode mode);
+void           deFloat16_selfTest                      (void);
 
 /*--------------------------------------------------------------------*//*!
  * \brief Convert 16-bit floating point number to 32 bit.
diff --git a/framework/delibs/debase/deFloat16Test.c b/framework/delibs/debase/deFloat16Test.c
new file mode 100644 (file)
index 0000000..ea5d217
--- /dev/null
@@ -0,0 +1,335 @@
+/*-------------------------------------------------------------------------
+ * drawElements Base Portability Library
+ * -------------------------------------
+ *
+ * Copyright 2017 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *//*!
+ * \file
+ * \brief Testing of deFloat16 functions.
+ *//*--------------------------------------------------------------------*/
+
+#include "deFloat16.h"
+#include "deRandom.h"
+
+DE_BEGIN_EXTERN_C
+
+static float getFloat32 (deUint32 sign, deUint32 biased_exponent, deUint32 mantissa)
+{
+       union
+       {
+               float           f;
+               deUint32        u;
+       } x;
+
+       x.u = (sign << 31) | (biased_exponent << 23) | mantissa;
+
+       return x.f;
+}
+
+static deFloat16 getFloat16 (deUint16 sign, deUint16 biased_exponent, deUint16 mantissa)
+{
+       return (deFloat16) ((sign << 15) | (biased_exponent << 10) | mantissa);
+}
+
+
+static deFloat16 deFloat32To16RTZ (float val32)
+{
+       return deFloat32To16Round(val32, DE_ROUNDINGMODE_TO_ZERO);
+}
+
+static deFloat16 deFloat32To16RTE (float val32)
+{
+       return deFloat32To16Round(val32, DE_ROUNDINGMODE_TO_NEAREST_EVEN);
+}
+
+void deFloat16_selfTest (void)
+{
+       /* 16-bit: 1    5 (0x00--0x1f)  10 (0x000--0x3ff)
+        * 32-bit: 1    8 (0x00--0xff)  23 (0x000000--0x7fffff)
+        */
+       deRandom        rnd;
+       int                     idx;
+
+       deRandom_init(&rnd, 0xdeadbeefu-1);
+
+       /* --- For rounding mode RTZ --- */
+
+       /* Zero */
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0, 0)) == getFloat16(0, 0, 0));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0, 0)) == getFloat16(1, 0, 0));
+
+       /* Inf */
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0xff, 0)) == getFloat16(0, 0x1f, 0));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0xff, 0)) == getFloat16(1, 0x1f, 0));
+
+       /* SNaN */
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0xff, 1)) == getFloat16(0, 0x1f, 1));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0xff, 1)) == getFloat16(1, 0x1f, 1));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0xff, 0x3fffff)) == getFloat16(0, 0x1f, 0x1ff));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0xff, 0x3fffff)) == getFloat16(1, 0x1f, 0x1ff));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0xff, 0x0003ff)) == getFloat16(0, 0x1f, 1));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0xff, 0x0003ff)) == getFloat16(1, 0x1f, 1));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0xff, 0x123456)) == getFloat16(0, 0x1f, 0x123456 >> 13));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0xff, 0x123456)) == getFloat16(1, 0x1f, 0x123456 >> 13));
+
+       /* QNaN */
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0xff, 0x400000)) == getFloat16(0, 0x1f, 0x200));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0xff, 0x400000)) == getFloat16(1, 0x1f, 0x200));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0xff, 0x7fffff)) == getFloat16(0, 0x1f, 0x3ff));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0xff, 0x7fffff)) == getFloat16(1, 0x1f, 0x3ff));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0xff, 0x4003ff)) == getFloat16(0, 0x1f, 0x200));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0xff, 0x4003ff)) == getFloat16(1, 0x1f, 0x200));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0xff, 0x723456)) == getFloat16(0, 0x1f, 0x723456 >> 13));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0xff, 0x723456)) == getFloat16(1, 0x1f, 0x723456 >> 13));
+
+       /* Denormalized */
+       for (idx = 0; idx < 256; ++idx)
+       {
+               deUint32        mantissa        = deRandom_getUint32(&rnd);
+
+               mantissa        &= 0x7fffffu;           /* Take the last 23 bits */
+               mantissa        |= (mantissa == 0);     /* Make sure it is not zero */
+
+               DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0, mantissa)) == getFloat16(0, 0, 0));
+               DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0, mantissa)) == getFloat16(1, 0, 0));
+       }
+
+       /* Normalized -> zero */
+       /* Absolute value: minimal 32-bit normalized */
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 1, 0)) == getFloat16(0, 0, 0));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 1, 0)) == getFloat16(1, 0, 0));
+       /* Absolute value: 2^-24 - e, extremely near minimal 16-bit denormalized */
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 - 25, 0x7fffff)) == getFloat16(0, 0, 0));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 - 25, 0x7fffff)) == getFloat16(1, 0, 0));
+       for (idx = 0; idx < 256; ++idx)
+       {
+               deUint32        exponent        = deRandom_getUint32(&rnd);
+               deUint32        mantissa        = deRandom_getUint32(&rnd);
+
+               exponent        = exponent % (127 - 25) + 1;    /* Make sure >= 1, <= 127 - 25 */
+               mantissa        &= 0x7fffffu;                                   /* Take the last 23 bits */
+
+               DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, exponent, mantissa)) == getFloat16(0, 0, 0));
+               DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, exponent, mantissa)) == getFloat16(1, 0, 0));
+       }
+
+       /* Normalized -> denormalized */
+       /* Absolute value: 2^-24, minimal 16-bit denormalized */
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 - 24, 0)) == getFloat16(0, 0, 1));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 - 24, 0)) == getFloat16(1, 0, 1));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 - 24, 1)) == getFloat16(0, 0, 1));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 - 24, 1)) == getFloat16(1, 0, 1));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 - 20, 0x123456)) == getFloat16(0, 0, 0x12));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 - 20, 0x123456)) == getFloat16(1, 0, 0x12));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 - 18, 0x654321)) == getFloat16(0, 0, 0x72));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 - 18, 0x654321)) == getFloat16(1, 0, 0x72));
+       /* Absolute value: 2^-14 - 2^-24 = (2 - 2^-9) * 2^-15, maximal 16-bit denormalized */
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 - 15, 0x7fc000)) == getFloat16(0, 0, 0x3ff)); /* 0x7fc000: 0111 1111 1100 0000 0000 0000 */
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 - 15, 0x7fc000)) == getFloat16(1, 0, 0x3ff));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 - 15, 0x7fc000 - 1)) == getFloat16(0, 0, 0x3fe));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 - 15, 0x7fc000 - 1)) == getFloat16(1, 0, 0x3fe));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 - 15, 0x7fc000 + 1)) == getFloat16(0, 0, 0x3ff));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 - 15, 0x7fc000 + 1)) == getFloat16(1, 0, 0x3ff));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 - 15, 0x7fffff)) == getFloat16(0, 0, 0x3ff));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 - 15, 0x7fffff)) == getFloat16(1, 0, 0x3ff));
+
+       /* Normalized -> normalized */
+       /* Absolute value: 2^-14, minimal 16-bit normalized */
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 - 14, 0)) == getFloat16(0, 1, 0));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 - 14, 0)) == getFloat16(1, 1, 0));
+       /* Absolute value: 65504 - 2^-8, extremely near maximal 16-bit normalized */
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 + 15, (0x3ff << 13) - 1)) == getFloat16(0, 0x1e, 0x3fe));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 + 15, (0x3ff << 13) - 1)) == getFloat16(1, 0x1e, 0x3fe));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 + 15, (0x3ff << 13) - 0x456)) == getFloat16(0, 0x1e, 0x3fe));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 + 15, (0x3ff << 13) - 0x456)) == getFloat16(1, 0x1e, 0x3fe));
+       /* Absolute value: 65504, maximal 16-bit normalized */
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 + 15, 0x3ff << 13)) == getFloat16(0, 0x1e, 0x3ff));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 + 15, 0x3ff << 13)) == getFloat16(1, 0x1e, 0x3ff));
+       for (idx = 0; idx < 256; ++idx)
+       {
+               deUint32        exponent        = deRandom_getUint32(&rnd);
+               deUint32        mantissa        = deRandom_getUint32(&rnd);
+
+               exponent        = exponent % ((127 + 14) - (127 -14) + 1) + (127 - 14); /* Make sure >= 127 - 14, <= 127 + 14 */
+               mantissa        &= 0x7fffffu;                                                                                   /* Take the last 23 bits */
+
+               DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, exponent, mantissa)) == getFloat16(0, (deUint16) (exponent + 15 - 127), (deUint16) (mantissa >> 13)));
+               DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, exponent, mantissa)) == getFloat16(1, (deUint16) (exponent + 15 - 127), (deUint16) (mantissa >> 13)));
+       }
+
+       /* Normalized -> minimal/maximal normalized */
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 + 15, (0x3ff << 13) + 1)) == getFloat16(0, 0x1e, 0x3ff));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 + 15, (0x3ff << 13) + 1)) == getFloat16(1, 0x1e, 0x3ff));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 + 15, (0x3ff << 13) + 0x123)) == getFloat16(0, 0x1e, 0x3ff));
+       DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 + 15, (0x3ff << 13) + 0x123)) == getFloat16(1, 0x1e, 0x3ff));
+       for (idx = 0; idx < 256; ++idx)
+       {
+               deUint32        exponent        = deRandom_getUint32(&rnd);
+               deUint32        mantissa        = deRandom_getUint32(&rnd);
+
+               exponent        = exponent % (0xfe - (127 + 16) + 1) + (127 + 16);      /* Make sure >= 127 + 16, <= 0xfe */
+               mantissa        &= 0x7fffffu;                                                                           /* Take the last 23 bits */
+
+               DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, exponent, mantissa)) == getFloat16(0, 0x1e, 0x3ff));
+               DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, exponent, mantissa)) == getFloat16(1, 0x1e, 0x3ff));
+       }
+
+       /* --- For rounding mode RTE --- */
+
+       /* Zero */
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0, 0)) == getFloat16(0, 0, 0));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0, 0)) == getFloat16(1, 0, 0));
+
+       /* Inf */
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0xff, 0)) == getFloat16(0, 0x1f, 0));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0xff, 0)) == getFloat16(1, 0x1f, 0));
+
+       /* SNaN */
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0xff, 1)) == getFloat16(0, 0x1f, 1));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0xff, 1)) == getFloat16(1, 0x1f, 1));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0xff, 0x3fffff)) == getFloat16(0, 0x1f, 0x1ff));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0xff, 0x3fffff)) == getFloat16(1, 0x1f, 0x1ff));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0xff, 0x0003ff)) == getFloat16(0, 0x1f, 1));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0xff, 0x0003ff)) == getFloat16(1, 0x1f, 1));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0xff, 0x123456)) == getFloat16(0, 0x1f, 0x123456 >> 13));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0xff, 0x123456)) == getFloat16(1, 0x1f, 0x123456 >> 13));
+
+       /* QNaN */
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0xff, 0x400000)) == getFloat16(0, 0x1f, 0x200));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0xff, 0x400000)) == getFloat16(1, 0x1f, 0x200));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0xff, 0x7fffff)) == getFloat16(0, 0x1f, 0x3ff));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0xff, 0x7fffff)) == getFloat16(1, 0x1f, 0x3ff));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0xff, 0x4003ff)) == getFloat16(0, 0x1f, 0x200));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0xff, 0x4003ff)) == getFloat16(1, 0x1f, 0x200));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0xff, 0x723456)) == getFloat16(0, 0x1f, 0x723456 >> 13));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0xff, 0x723456)) == getFloat16(1, 0x1f, 0x723456 >> 13));
+
+       /* Denormalized */
+       for (idx = 0; idx < 256; ++idx)
+       {
+               deUint32        mantissa        = deRandom_getUint32(&rnd);
+
+               mantissa        &= 0x7fffffu;           /* Take the last 23 bits */
+               mantissa        |= (mantissa == 0);     /* Make sure it is not zero */
+
+               DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0, mantissa)) == getFloat16(0, 0, 0));
+               DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0, mantissa)) == getFloat16(1, 0, 0));
+       }
+
+       /* Normalized -> zero and denormalized */
+       /* Absolute value: minimal 32-bit normalized */
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 1, 0)) == getFloat16(0, 0, 0));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 1, 0)) == getFloat16(1, 0, 0));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 42, 0x7abcde)) == getFloat16(0, 0, 0));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 42, 0x7abcde)) == getFloat16(1, 0, 0));
+       for (idx = 0; idx < 256; ++idx)
+       {
+               deUint32        exponent        = deRandom_getUint32(&rnd);
+               deUint32        mantissa        = deRandom_getUint32(&rnd);
+
+               exponent        = exponent % (127 - 26) + 1;    /* Make sure >= 1, <= 127 - 26 */
+               mantissa        &= 0x7fffffu;                                   /* Take the last 23 bits */
+
+               DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, exponent, mantissa)) == getFloat16(0, 0, 0));
+               DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, exponent, mantissa)) == getFloat16(1, 0, 0));
+       }
+       /* Absolute value: 2^-25, minimal 16-bit denormalized: 2^-24 */
+       /* The following six cases need to right shift mantissa (with leading 1) 10 bits  --------------------> to here */
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 25, 0)) == getFloat16(0, 0, 0));    /* XX XXXX XXXX 1 000 0000 0000 0000 0000 0000 */
+       /*                                                                                                      Take the first 10 bits with RTE ------ 00 0000 0000 */
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 25, 0)) == getFloat16(1, 0, 0));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 25, 1)) == getFloat16(0, 0, 1));    /* XX XXXX XXXX 1 000 0000 0000 0000 0000 0001 */
+       /*                                                                                                      Take the first 10 bits with RTE ------ 00 0000 0001 */
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 25, 1)) == getFloat16(1, 0, 1));
+       /* Absolute value: 2^-24 - e, extremely near minimal 16-bit denormalized */
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 25, 0x7fffff)) == getFloat16(0, 0, 1));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 25, 0x7fffff)) == getFloat16(1, 0, 1));
+       /* Absolute value: 2^-24, minimal 16-bit denormalized */
+       /* The following (127 - 24) cases need to right shift mantissa (with leading 1) 9 bits  -----------------> to here */
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 24, 0)) == getFloat16(0, 0, 1));            /* X XXXX XXXX 1 000 0000 0000 0000 0000 0000 */
+       /*                                                                                                      Take the first 10 bits with RTE ---------- 0 0000 0000 1 */
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 24, 0)) == getFloat16(1, 0, 1));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 24, 1)) == getFloat16(0, 0, 1));            /* X XXXX XXXX 1 000 0000 0000 0000 0000 0001 */
+       /*                                                                                                      Take the first 10 bits with RTE ---------- 0 0000 0000 1 */
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 24, 1)) == getFloat16(1, 0, 1));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 24, 0x400000)) == getFloat16(0, 0, 2));     /* X XXXX XXXX 1 100 0000 0000 0000 0000 0000 */
+       /*                                                                                                      Take the first 10 bits with RTE ---------- 0 0000 0000 2 */
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 24, 0x400000)) == getFloat16(1, 0, 2));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 24, 0x400001)) == getFloat16(0, 0, 2));     /* X XXXX XXXX 1 100 0000 0000 0000 0000 0001 */
+       /*                                                                                                      Take the first 10 bits with RTE ---------- 0 0000 0000 2 */
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 24, 0x400001)) == getFloat16(1, 0, 2));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 24, 0x4fffff)) == getFloat16(0, 0, 2));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 24, 0x4fffff)) == getFloat16(1, 0, 2));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 20, 0x123456)) == getFloat16(0, 0, 0x12));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 20, 0x123456)) == getFloat16(1, 0, 0x12));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 18, 0x654321)) == getFloat16(0, 0, 0x73));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 18, 0x654321)) == getFloat16(1, 0, 0x73));
+       /* Absolute value: 2^-14 - 2^-24 = (2 - 2^-9) * 2^-15, maximal 16-bit denormalized */
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 15, 0x7fc000)) == getFloat16(0, 0, 0x3ff)); /* 0x7fc000: 0111 1111 1100 0000 0000 0000 */
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 15, 0x7fc000)) == getFloat16(1, 0, 0x3ff));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 15, 0x7fc000 - 1)) == getFloat16(0, 0, 0x3ff));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 15, 0x7fc000 - 1)) == getFloat16(1, 0, 0x3ff));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 15, 0x7fc000 + 1)) == getFloat16(0, 0, 0x3ff));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 15, 0x7fc000 + 1)) == getFloat16(1, 0, 0x3ff));
+
+       /* Normalized -> normalized */
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 15, 0x7fe000)) == getFloat16(0, 1, 0));     /* 0x7fe000: 0111 1111 1110 0000 0000 0000 */
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 15, 0x7fe000)) == getFloat16(1, 1, 0));
+       /* Absolute value: (2 - 2^-23) * 2^-15, extremely near 2^-14, minimal 16-bit normalized */
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 15, 0x7fffff)) == getFloat16(0, 1, 0));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 15, 0x7fffff)) == getFloat16(1, 1, 0));
+       /* Absolute value: 2^-14, minimal 16-bit normalized */
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 14, 0)) == getFloat16(0, 1, 0));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 14, 0)) == getFloat16(1, 1, 0));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 + 15, (0x3fe << 13) + (1 << 12))) == getFloat16(0, 0x1e, 0x3fe));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 + 15, (0x3fe << 13) + (1 << 12))) == getFloat16(1, 0x1e, 0x3fe));
+
+       /* Normalized -> minimal/maximal normalized */
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 + 15, (0x3fe << 13) + (1 << 12) + 1)) == getFloat16(0, 0x1e, 0x3ff));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 + 15, (0x3fe << 13) + (1 << 12) + 1)) == getFloat16(1, 0x1e, 0x3ff));
+       /* Absolute value: 65504 - 2^-8, extremely near maximal 16-bit normalized */
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 + 15, (0x3ff << 13) - 1)) == getFloat16(0, 0x1e, 0x3ff));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 + 15, (0x3ff << 13) - 1)) == getFloat16(1, 0x1e, 0x3ff));
+       /* Absolute value: 65504, maximal 16-bit normalized */
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 + 15, 0x3ff << 13)) == getFloat16(0, 0x1e, 0x3ff));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 + 15, 0x3ff << 13)) == getFloat16(1, 0x1e, 0x3ff));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 + 15, (0x3ff << 13) + 1)) == getFloat16(0, 0x1e, 0x3ff));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 + 15, (0x3ff << 13) + 1)) == getFloat16(1, 0x1e, 0x3ff));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 + 15, (0x3ff << 13) + 0x456)) == getFloat16(0, 0x1e, 0x3ff));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 + 15, (0x3ff << 13) + 0x456)) == getFloat16(1, 0x1e, 0x3ff));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 + 15, (0x3ff << 13) + (1 << 12) - 1)) == getFloat16(0, 0x1e, 0x3ff));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 + 15, (0x3ff << 13) + (1 << 12) - 1)) == getFloat16(1, 0x1e, 0x3ff));
+
+       /* Normalized -> Inf */
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 + 15, (0x3ff << 13) + (1 << 12))) == getFloat16(0, 0x1f, 0));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 + 15, (0x3ff << 13) + (1 << 12))) == getFloat16(1, 0x1f, 0));
+       /* Absolute value: maximal 32-bit normalized */
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 + 15, 0x7fffff)) == getFloat16(0, 0x1f, 0));
+       DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 + 15, 0x7fffff)) == getFloat16(1, 0x1f, 0));
+       for (idx = 0; idx < 256; ++idx)
+       {
+               deUint32        exponent        = deRandom_getUint32(&rnd);
+               deUint32        mantissa        = deRandom_getUint32(&rnd);
+
+               exponent        = exponent % (0xfe - (127 + 16) + 1) + (127 + 16);      /* Make sure >= 127 + 16, <= 0xfe */
+               mantissa        &= 0x7fffffu;                                                                           /* Take the last 23 bits */
+
+               DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, exponent, mantissa)) == getFloat16(0, 0x1f, 0));
+               DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, exponent, mantissa)) == getFloat16(1, 0x1f, 0));
+       }
+}
+
+DE_END_EXTERN_C
index 26e2aef..c761751 100644 (file)
@@ -46,7 +46,7 @@ deRoundingMode deGetRoundingMode (void)
                case _RC_CHOP:  return DE_ROUNDINGMODE_TO_ZERO;
                case _RC_UP:    return DE_ROUNDINGMODE_TO_POSITIVE_INF;
                case _RC_DOWN:  return DE_ROUNDINGMODE_TO_NEGATIVE_INF;
-               case _RC_NEAR:  return DE_ROUNDINGMODE_TO_NEAREST;
+               case _RC_NEAR:  return DE_ROUNDINGMODE_TO_NEAREST_EVEN;
                default:                return DE_ROUNDINGMODE_LAST;
        }
 #elif (DE_COMPILER == DE_COMPILER_GCC) || (DE_COMPILER == DE_COMPILER_CLANG)
@@ -56,7 +56,7 @@ deRoundingMode deGetRoundingMode (void)
                case FE_TOWARDZERO:     return DE_ROUNDINGMODE_TO_ZERO;
                case FE_UPWARD:         return DE_ROUNDINGMODE_TO_POSITIVE_INF;
                case FE_DOWNWARD:       return DE_ROUNDINGMODE_TO_NEGATIVE_INF;
-               case FE_TONEAREST:      return DE_ROUNDINGMODE_TO_NEAREST;
+               case FE_TONEAREST:      return DE_ROUNDINGMODE_TO_NEAREST_EVEN;
                default:                        return DE_ROUNDINGMODE_LAST;
        }
 #else
@@ -76,7 +76,7 @@ deBool deSetRoundingMode (deRoundingMode mode)
                case DE_ROUNDINGMODE_TO_ZERO:                   flag = _RC_CHOP;        break;
                case DE_ROUNDINGMODE_TO_POSITIVE_INF:   flag = _RC_UP;          break;
                case DE_ROUNDINGMODE_TO_NEGATIVE_INF:   flag = _RC_DOWN;        break;
-               case DE_ROUNDINGMODE_TO_NEAREST:                flag = _RC_NEAR;        break;
+               case DE_ROUNDINGMODE_TO_NEAREST_EVEN:   flag = _RC_NEAR;        break;
                default:
                        DE_ASSERT(DE_FALSE);
        }
@@ -92,7 +92,7 @@ deBool deSetRoundingMode (deRoundingMode mode)
                case DE_ROUNDINGMODE_TO_ZERO:                   flag = FE_TOWARDZERO;   break;
                case DE_ROUNDINGMODE_TO_POSITIVE_INF:   flag = FE_UPWARD;               break;
                case DE_ROUNDINGMODE_TO_NEGATIVE_INF:   flag = FE_DOWNWARD;             break;
-               case DE_ROUNDINGMODE_TO_NEAREST:                flag = FE_TONEAREST;    break;
+               case DE_ROUNDINGMODE_TO_NEAREST_EVEN:   flag = FE_TONEAREST;    break;
                default:
                        DE_ASSERT(DE_FALSE);
        }
index 4ab86fb..61f2d14 100644 (file)
@@ -45,7 +45,7 @@ DE_BEGIN_EXTERN_C
 
 typedef enum deRoundingMode_e
 {
-       DE_ROUNDINGMODE_TO_NEAREST = 0,
+       DE_ROUNDINGMODE_TO_NEAREST_EVEN = 0,
        DE_ROUNDINGMODE_TO_ZERO,
        DE_ROUNDINGMODE_TO_POSITIVE_INF,
        DE_ROUNDINGMODE_TO_NEGATIVE_INF,
index 665cd89..ca2ad8a 100644 (file)
@@ -43,6 +43,7 @@
 
 // debase
 #include "deInt32.h"
+#include "deFloat16.h"
 #include "deMath.h"
 #include "deSha1.h"
 #include "deMemory.h"
@@ -159,10 +160,11 @@ public:
 
        void init (void)
        {
-               addChild(new SelfCheckCase(m_testCtx, "int32",  "deInt32_selfTest()",   deInt32_selfTest));
-               addChild(new SelfCheckCase(m_testCtx, "math",   "deMath_selfTest()",    deMath_selfTest));
-               addChild(new SelfCheckCase(m_testCtx, "sha1",   "deSha1_selfTest()",    deSha1_selfTest));
-               addChild(new SelfCheckCase(m_testCtx, "memory", "deMemory_selfTest()",  deMemory_selfTest));
+               addChild(new SelfCheckCase(m_testCtx, "int32",          "deInt32_selfTest()",   deInt32_selfTest));
+               addChild(new SelfCheckCase(m_testCtx, "float16",        "deFloat16_selfTest()", deFloat16_selfTest));
+               addChild(new SelfCheckCase(m_testCtx, "math",           "deMath_selfTest()",    deMath_selfTest));
+               addChild(new SelfCheckCase(m_testCtx, "sha1",           "deSha1_selfTest()",    deSha1_selfTest));
+               addChild(new SelfCheckCase(m_testCtx, "memory",         "deMemory_selfTest()",  deMemory_selfTest));
        }
 };