framework/common/tcuFloatFormat.cpp

   1 /*-------------------------------------------------------------------------
   2  * drawElements Quality Program Tester Core
   3  * ----------------------------------------
   4  *
   5  * Copyright 2014 The Android Open Source Project
   6  *
   7  * Licensed under the Apache License, Version 2.0 (the "License");
   8  * you may not use this file except in compliance with the License.
   9  * You may obtain a copy of the License at
  10  *
  11  *      http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  *
  19  *//*!
  20  * \file
  21  * \brief Adjustable-precision floating point operations.
  22  *//*--------------------------------------------------------------------*/
  23
  24 #include "tcuFloatFormat.hpp"
  25
  26 #include "deMath.h"
  27 #include "deUniquePtr.hpp"
  28
  29 #include <sstream>
  30 #include <iomanip>
  31 #include <limits>
  32
  33 namespace tcu
  34 {
  35 namespace
  36 {
  37
  38 Interval chooseInterval(YesNoMaybe choice, const Interval& no, const Interval& yes)
  39 {
  40         switch (choice)
  41         {
  42                 case NO:        return no;
  43                 case YES:       return yes;
  44                 case MAYBE:     return no | yes;
  45                 default:        DE_FATAL("Impossible case");
  46         }
  47
  48         return Interval();
  49 }
  50
  51 double computeMaxValue (int maxExp, int fractionBits)
  52 {
  53         return (deLdExp(1.0, maxExp) +
  54                         deLdExp(double((1ull << fractionBits) - 1), maxExp - fractionBits));
  55 }
  56
  57 } // anonymous
  58
  59 FloatFormat::FloatFormat (int                   minExp,
  60                                                   int                   maxExp,
  61                                                   int                   fractionBits,
  62                                                   bool                  exactPrecision,
  63                                                   YesNoMaybe    hasSubnormal_,
  64                                                   YesNoMaybe    hasInf_,
  65                                                   YesNoMaybe    hasNaN_)
  66         : m_minExp                      (minExp)
  67         , m_maxExp                      (maxExp)
  68         , m_fractionBits        (fractionBits)
  69         , m_hasSubnormal        (hasSubnormal_)
  70         , m_hasInf                      (hasInf_)
  71         , m_hasNaN                      (hasNaN_)
  72         , m_exactPrecision      (exactPrecision)
  73         , m_maxValue            (computeMaxValue(maxExp, fractionBits))
  74 {
  75         DE_ASSERT(minExp <= maxExp);
  76 }
  77
  78 /*-------------------------------------------------------------------------
  79  * On the definition of ULP
  80  *
  81  * The GLSL spec does not define ULP. However, it refers to IEEE 754, which
  82  * (reportedly) uses Harrison's definition:
  83  *
  84  * ULP(x) is the distance between the closest floating point numbers
  85  * a and be such that a <= x <= b and a != b
  86  *
  87  * Note that this means that when x = 2^n, ULP(x) = 2^(n-p-1), i.e. it is the
  88  * distance to the next lowest float, not next highest.
  89  *
  90  * Furthermore, it is assumed that ULP is calculated relative to the exact
  91  * value, not the approximation. This is because otherwise a less accurate
  92  * approximation could be closer in ULPs, because its ULPs are bigger.
  93  *
  94  * For details, see "On the definition of ulp(x)" by Jean-Michel Muller
  95  *
  96  *-----------------------------------------------------------------------*/
  97
  98 double FloatFormat::ulp (double x, double count) const
  99 {
 100         int                             exp             = 0;
 101         const double    frac    = deFractExp(deAbs(x), &exp);
 102
 103         if (deIsNaN(frac))
 104                 return TCU_NAN;
 105         else if (deIsInf(frac))
 106                 return deLdExp(1.0, m_maxExp - m_fractionBits);
 107         else if (frac == 1.0)
 108         {
 109                 // Harrison's ULP: choose distance to closest (i.e. next lower) at binade
 110                 // boundary.
 111                 --exp;
 112         }
 113         else if (frac == 0.0)
 114                 exp = m_minExp;
 115
 116         // ULP cannot be lower than the smallest quantum.
 117         exp = de::max(exp, m_minExp);
 118
 119         {
 120                 const double            oneULP  = deLdExp(1.0, exp - m_fractionBits);
 121                 ScopedRoundingMode      ctx             (DE_ROUNDINGMODE_TO_POSITIVE_INF);
 122
 123                 return oneULP * count;
 124         }
 125 }
 126
 127 //! Return the difference between the given nominal exponent and
 128 //! the exponent of the lowest significand bit of the
 129 //! representation of a number with this format.
 130 //! For normal numbers this is the number of significand bits, but
 131 //! for subnormals it is less and for values of exp where 2^exp is too
 132 //! small to represent it is <0
 133 int FloatFormat::exponentShift (int exp) const
 134 {
 135         return m_fractionBits - de::max(m_minExp - exp, 0);
 136 }
 137
 138 //! Return the number closest to `d` that is exactly representable with the
 139 //! significand bits and minimum exponent of the floatformat. Round up if
 140 //! `upward` is true, otherwise down.
 141 double FloatFormat::round (double d, bool upward) const
 142 {
 143         int                             exp                     = 0;
 144         const double    frac            = deFractExp(d, &exp);
 145         const int               shift           = exponentShift(exp);
 146         const double    shiftFrac       = deLdExp(frac, shift);
 147         const double    roundFrac       = upward ? deCeil(shiftFrac) : deFloor(shiftFrac);
 148
 149         return deLdExp(roundFrac, exp - shift);
 150 }
 151
 152 //! Return the range of numbers that `d` might be converted to in the
 153 //! floatformat, given its limitations with infinities, subnormals and maximum
 154 //! exponent.
 155 Interval FloatFormat::clampValue (double d) const
 156 {
 157         const double    rSign           = deSign(d);
 158         int                             rExp            = 0;
 159
 160         DE_ASSERT(!deIsNaN(d));
 161
 162         deFractExp(d, &rExp);
 163         if (rExp < m_minExp)
 164                 return chooseInterval(m_hasSubnormal, rSign * 0.0, d);
 165         else if (deIsInf(d) || rExp > m_maxExp)
 166                 return chooseInterval(m_hasInf, rSign * getMaxValue(), rSign * TCU_INFINITY);
 167
 168         return Interval(d);
 169 }
 170
 171 //! Return the range of numbers that might be used with this format to
 172 //! represent a number within `x`.
 173 Interval FloatFormat::convert (const Interval& x) const
 174 {
 175         Interval ret;
 176         Interval tmp = x;
 177
 178         if (x.hasNaN())
 179         {
 180                 // If NaN might be supported, NaN is a legal return value
 181                 if (m_hasNaN != NO)
 182                         ret |= TCU_NAN;
 183
 184                 // If NaN might not be supported, any (non-NaN) value is legal,
 185                 // _subject_ to clamping. Hence we modify tmp, not ret.
 186                 if (m_hasNaN != YES)
 187                         tmp = Interval::unbounded();
 188         }
 189
 190         // Round both bounds _inwards_ to closest representable values.
 191         if (!tmp.empty())
 192                 ret |= clampValue(round(tmp.lo(), true)) | clampValue(round(tmp.hi(), false));
 193
 194         // If this format's precision is not exact, the (possibly out-of-bounds)
 195         // original value is also a possible result.
 196         if (!m_exactPrecision)
 197                 ret |= x;
 198
 199         return ret;
 200 }
 201
 202 double FloatFormat::roundOut (double d, bool upward, bool roundUnderOverflow) const
 203 {
 204         int     exp     = 0;
 205         deFractExp(d, &exp);
 206
 207         if (roundUnderOverflow && exp > m_maxExp && (upward == (d < 0.0)))
 208                 return deSign(d) * getMaxValue();
 209         else
 210                 return round(d, upward);
 211 }
 212
 213 //! Round output of an operation.
 214 //! \param roundUnderOverflow Can +/-inf rounded to min/max representable;
 215 //!                                                       should be false if any of operands was inf, true otherwise.
 216 Interval FloatFormat::roundOut (const Interval& x, bool roundUnderOverflow) const
 217 {
 218         Interval ret = x.nan();
 219
 220         if (!x.empty())
 221                 ret |= Interval(roundOut(x.lo(), false, roundUnderOverflow),
 222                                                 roundOut(x.hi(), true, roundUnderOverflow));
 223
 224         return ret;
 225 }
 226
 227 std::string     FloatFormat::floatToHex (double x) const
 228 {
 229         if (deIsNaN(x))
 230                 return "NaN";
 231         else if (deIsInf(x))
 232                 return (x < 0.0 ? "-" : "+") + std::string("inf");
 233         else if (x == 0.0) // \todo [2014-03-27 lauri] Negative zero
 234                 return "0.0";
 235
 236         int                                     exp                     = 0;
 237         const double            frac            = deFractExp(deAbs(x), &exp);
 238         const int                       shift           = exponentShift(exp);
 239         const deUint64          bits            = deUint64(deLdExp(frac, shift));
 240         const deUint64          whole           = bits >> m_fractionBits;
 241         const deUint64          fraction        = bits & ((deUint64(1) << m_fractionBits) - 1);
 242         const int                       exponent        = exp + m_fractionBits - shift;
 243         const int                       numDigits       = (m_fractionBits + 3) / 4;
 244         const deUint64          aligned         = fraction << (numDigits * 4 - m_fractionBits);
 245         std::ostringstream      oss;
 246
 247         oss << (x < 0 ? "-" : "")
 248                 << "0x" << whole << "."
 249                 << std::hex << std::setw(numDigits) << std::setfill('0') << aligned
 250                 << "p" << std::dec << std::setw(0) << exponent;
 251
 252         return oss.str();
 253 }
 254
 255 std::string FloatFormat::intervalToHex (const Interval& interval) const
 256 {
 257         if (interval.empty())
 258                 return interval.hasNaN() ? "{ NaN }" : "{}";
 259
 260         else if (interval.lo() == interval.hi())
 261                 return (std::string(interval.hasNaN() ? "{ NaN, " : "{ ") +
 262                                 floatToHex(interval.lo()) + " }");
 263         else if (interval == Interval::unbounded(true))
 264                 return "<any>";
 265
 266         return (std::string(interval.hasNaN() ? "{ NaN } | " : "") +
 267                         "[" + floatToHex(interval.lo()) + ", " + floatToHex(interval.hi()) + "]");
 268 }
 269
 270 template <typename T>
 271 static FloatFormat nativeFormat (void)
 272 {
 273         typedef std::numeric_limits<T> Limits;
 274
 275         DE_ASSERT(Limits::radix == 2);
 276
 277         return FloatFormat(Limits::min_exponent - 1,    // These have a built-in offset of one
 278                                            Limits::max_exponent - 1,
 279                                            Limits::digits - 1,                  // don't count the hidden bit
 280                                            Limits::has_denorm != std::denorm_absent,
 281                                            Limits::has_infinity ? YES : NO,
 282                                            Limits::has_quiet_NaN ? YES : NO,
 283                                            ((Limits::has_denorm == std::denorm_present) ? YES :
 284                                                 (Limits::has_denorm == std::denorm_absent) ? NO :
 285                                                 MAYBE));
 286 }
 287
 288 FloatFormat     FloatFormat::nativeFloat (void)
 289 {
 290         return nativeFormat<float>();
 291 }
 292
 293 FloatFormat     FloatFormat::nativeDouble (void)
 294 {
 295         return nativeFormat<double>();
 296 }
 297
 298 namespace
 299 {
 300
 301 using std::string;
 302 using std::ostringstream;
 303 using de::MovePtr;
 304 using de::UniquePtr;
 305
 306 class Test
 307 {
 308 protected:
 309
 310                                                         Test            (MovePtr<FloatFormat> fmt) : m_fmt(fmt) {}
 311         double                                  p                       (int e) const                           { return deLdExp(1.0, e); }
 312         void                                    check           (const string&  expr,
 313                                                                                  double                 result,
 314                                                                                  double                 reference) const;
 315         void                                    testULP         (double arg, double ref) const;
 316         void                                    testRound       (double arg, double refDown, double refUp) const;
 317
 318         UniquePtr<FloatFormat>  m_fmt;
 319 };
 320
 321 void Test::check (const string& expr, double result, double reference) const
 322 {
 323         if (result != reference)
 324         {
 325                 ostringstream oss;
 326                 oss << expr << " returned " << result << ", expected " << reference;
 327                 TCU_FAIL(oss.str().c_str());
 328         }
 329 }
 330
 331 void Test::testULP (double arg, double ref) const
 332 {
 333         ostringstream   oss;
 334
 335         oss << "ulp(" << arg << ")";
 336         check(oss.str(), m_fmt->ulp(arg), ref);
 337 }
 338
 339 void Test::testRound (double arg, double refDown, double refUp) const
 340 {
 341         {
 342                 ostringstream oss;
 343                 oss << "round(" << arg << ", false)";
 344                 check(oss.str(), m_fmt->round(arg, false), refDown);
 345         }
 346         {
 347                 ostringstream oss;
 348                 oss << "round(" << arg << ", true)";
 349                 check(oss.str(), m_fmt->round(arg, true), refUp);
 350         }
 351 }
 352
 353 class TestBinary32 : public Test
 354 {
 355 public:
 356                         TestBinary32 (void)
 357                                 : Test (MovePtr<FloatFormat>(new FloatFormat(-126, 127, 23, true))) {}
 358
 359         void    runTest (void) const;
 360 };
 361
 362 void TestBinary32::runTest (void) const
 363 {
 364         testULP(p(0),                           p(-24));
 365         testULP(p(0) + p(-23),          p(-23));
 366         testULP(p(-124),                        p(-148));
 367         testULP(p(-125),                        p(-149));
 368         testULP(p(-125) + p(-140),      p(-148));
 369         testULP(p(-126),                        p(-149));
 370         testULP(p(-130),                        p(-149));
 371
 372         testRound(p(0) + p(-20) + p(-40),       p(0) + p(-20),          p(0) + p(-20) + p(-23));
 373         testRound(p(-126) - p(-150),            p(-126) - p(-149),      p(-126));
 374
 375         TCU_CHECK(m_fmt->floatToHex(p(0)) == "0x1.000000p0");
 376         TCU_CHECK(m_fmt->floatToHex(p(8) + p(-4)) == "0x1.001000p8");
 377         TCU_CHECK(m_fmt->floatToHex(p(-140)) == "0x0.000400p-126");
 378         TCU_CHECK(m_fmt->floatToHex(p(-140)) == "0x0.000400p-126");
 379         TCU_CHECK(m_fmt->floatToHex(p(-126) + p(-125)) == "0x1.800000p-125");
 380 }
 381
 382 } // anonymous
 383
 384 void FloatFormat_selfTest (void)
 385 {
 386         TestBinary32    test32;
 387         test32.runTest();
 388 }
 389
 390 } // tcu