Add 32 to 16 bit float conversion w.r.t rounding mode

[platform/upstream/VK-GL-CTS.git] / framework / delibs / debase / deFloat16.c
diff --git a/framework/delibs/debase/deFloat16.c b/framework/delibs/debase/deFloat16.c

index 6460f05..e8a1057 100644 (file)
--- a/framework/delibs/debase/deFloat16.c
+++ b/framework/delibs/debase/deFloat16.c
@@ -99,6 +99,163 @@ deFloat16 deFloat32To16 (float val32)
         }
  }
  
+/*--------------------------------------------------------------------*//*!
+ * \brief Round the given number `val` to nearest even by discarding
+ *        the last `numBitsToDiscard` bits.
+ *
+ * The discarded bits are compared against one half of the lowest kept
+ * bit: below half rounds down, above half rounds up, and an exact tie
+ * rounds to whichever neighbour has a zero lowest bit.
+ *
+ * \param val value to round
+ * \param numBitsToDiscard number of (least significant) bits to discard,
+ *        must be in the range [1, 31]
+ * \return The rounded value with the last `numBitsToDiscard` removed
+ *//*--------------------------------------------------------------------*/
+static deUint32 roundToNearestEven (deUint32 val, const deUint32 numBitsToDiscard)
+{
+       deUint32        lastBits;       /* All bits that are going to be discarded. */
+       deUint32        headBit;        /* Most significant discarded bit, i.e. the "half" bit. */
+
+       /* Validate the shift count before any shift uses it. */
+       DE_ASSERT(numBitsToDiscard > 0 && numBitsToDiscard < 32);
+
+       /* Shift an unsigned one: a signed `1 << 31` would shift into the sign bit. */
+       lastBits        = val & (((deUint32)1 << numBitsToDiscard) - 1u);
+       headBit         = val & ((deUint32)1 << (numBitsToDiscard - 1u));
+
+       val >>= numBitsToDiscard;
+
+       if (headBit == 0u)
+       {
+               /* Discarded part is below one half: round down. */
+               return val;
+       }
+
+       if (lastBits == headBit)
+       {
+               /* Exact tie: only the half bit was set. Round up only if the kept value is odd. */
+               return val + (val & 1u);
+       }
+
+       /* Discarded part is above one half: round up. */
+       return val + 1u;
+}
+
+/*--------------------------------------------------------------------*//*!
+ * \brief Convert a 32-bit float to a 16-bit float bit pattern using the
+ *        requested rounding mode.
+ * \param val32 32-bit float to convert
+ * \param mode rounding mode; only DE_ROUNDINGMODE_TO_ZERO and
+ *        DE_ROUNDINGMODE_TO_NEAREST_EVEN are accepted by the assertion
+ * \return The converted value as 16-bit float bits
+ *//*--------------------------------------------------------------------*/
+deFloat16 deFloat32To16Round (float val32, deRoundingMode mode)
+{
+       union
+       {
+               float           asFloat;        /* Value viewed as a 32-bit float */
+               deUint32        asBits;         /* Same storage viewed as a 32-bit unsigned integer */
+       } pun;
+       deUint32        signBit;                /* Sign moved to bit 15, its position in a 16-bit float */
+       deUint32        biasedExp32;            /* Exponent field of the 32-bit float (bias 127) */
+       deUint32        frac;                   /* 23-bit mantissa field of the 32-bit float */
+       int             biasedExp16;            /* Same exponent re-biased for 16-bit floats (bias 15) */
+       const int       towardZero      = (mode == DE_ROUNDINGMODE_TO_ZERO);
+
+       /* Only these two rounding modes are implemented. */
+       DE_ASSERT(mode == DE_ROUNDINGMODE_TO_ZERO || mode == DE_ROUNDINGMODE_TO_NEAREST_EVEN);
+
+       pun.asFloat     = val32;
+       signBit         = (pun.asBits >> 16u) & 0x00008000u;
+       biasedExp32     = (pun.asBits >> 23u) & 0x000000ffu;
+       frac            = pun.asBits & 0x007fffffu;
+       biasedExp16     = (int) (biasedExp32) - 127 + 15;
+
+       /* Zero and 32-bit denormals: far below the 16-bit range, so the result is a signed zero. */
+       if (biasedExp32 == 0u)
+       {
+               return (deFloat16) signBit;
+       }
+
+       /* Inf and NaN: exponent field is all ones. */
+       if (biasedExp32 == 0x000000ffu)
+       {
+               deUint32 payload;
+
+               if (frac == 0u)
+               {
+                       return (deFloat16) (signBit | 0x7c00u);
+               }
+
+               /* Keep the top 10 mantissa bits; force a nonzero payload so NaN does not turn into Inf. */
+               payload = frac >> 13u;
+               if (payload == 0u)
+               {
+                       payload = 1u;
+               }
+               return (deFloat16) (signBit | 0x7c00u | payload);
+       }
+
+       /* From here on the input is a normalized 32-bit float. */
+
+       /* Too small even for a 16-bit denormal in either supported mode: signed zero. */
+       if (biasedExp16 < -10)
+       {
+               return (deFloat16) signBit;
+       }
+
+       /* Result is zero or a 16-bit denormal (or, after rounding up, the smallest normal). */
+       if (biasedExp16 <= 0)
+       {
+               /* Restore the implicit leading one, giving a 24-bit significand. Dropping 14 bits
+                * leaves 10 mantissa bits plus that leading one; a further -biasedExp16 bits are
+                * dropped to encode the underflowed exponent in the mantissa.
+                */
+               const deUint32  significand     = frac | 0x00800000u;
+               const deUint32  dropBits        = (deUint32) (14 - biasedExp16);
+               /* When rounding, a carry out of the 10 mantissa bits lands in the exponent field. */
+               const deUint32  halfBits        = towardZero
+                                                               ? (significand >> dropBits)
+                                                               : roundToNearestEven(significand, dropBits);
+
+               return (deFloat16) (signBit | halfBits);
+       }
+
+       /* Result is a normalized 16-bit float (or Inf if rounding carries out of exponent 30). */
+       if (biasedExp16 <= 30)
+       {
+               deUint32 expField = (deUint32) biasedExp16 << 10u;
+
+               if (towardZero)
+               {
+                       return (deFloat16) (signBit | expField | (frac >> 13u));
+               }
+
+               frac            = roundToNearestEven(frac, 13);
+               /* Bit 10 set means the mantissa overflowed: carry it into the exponent field. */
+               expField        += frac & (1u << 10);
+               frac            &= (1u << 10) - 1u;
+               return (deFloat16) (signBit | expField | frac);
+       }
+
+       /* Too large for a 16-bit float. Per IEEE Std 754-2008 Section 7.4, rounding toward zero
+        * yields the largest finite number (0x7bff) and round-to-nearest yields Inf (0x7c00),
+        * both with the sign of the input.
+        */
+       return (deFloat16) (signBit | (towardZero ? 0x7bffu : 0x7c00u));
+}
+
  float deFloat16To32 (deFloat16 val16)
  {
         deUint32 sign;