compute/cker/include/cker/operation/Helper/RandomDistributions.h

   1 /*
   2  * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
   3  * Copyright 2015 The TensorFlow Authors. All Rights Reserved.
   4  *
   5  * Licensed under the Apache License, Version 2.0 (the "License");
   6  * you may not use this file except in compliance with the License.
   7  * You may obtain a copy of the License at
   8  *
   9  *      http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 #ifndef __NNFW_CKER_HELPER_RANDOM_DISTRIBUTIONS_H__
  19 #define __NNFW_CKER_HELPER_RANDOM_DISTRIBUTIONS_H__
  20
  21 #include <string.h>
  22
  23 #include <cmath>
  24
  25 #include <algorithm>
  26 #include <type_traits>
  27
  28 #include "cker/Types.h"
  29 #include "cker/Shape.h"
  30 #include "cker/Utils.h"
  31
  32 #include "cker/eigen/EigenSupport.h"
  33 #include "cker/operation/Helper/PhiloxRandom.h"
  34
  35 namespace nnfw
  36 {
  37 namespace cker
  38 {
  39 namespace random
  40 {
  41
  42 // Helper function to convert a 16-bit integer to a half between [0..1).
  43 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16_t x);
  44 // Helper function to convert a 16-bit integer to a bfloat16 between [0..1).
  45 // PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x);
  46 // Helper function to convert a 32-bit integer to a float between [0..1).
  47 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32_t x);
  48 // Helper function to convert two 32-bit integers to a double between [0..1).
  49 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32_t x0, uint32_t x1);
  50
  51 // Computes a + b. Requires that the result is representable in the destination
  52 // type and that b is not maximal (i.e. b + 1 is not 0). Notably, the addend b
  53 // need *not* be representable in that type. (The condition on b excludes the
  54 // extremal case INT_MIN + UINT_MAX = INT_MAX, which this function cannot
  55 // compute.)
  56 template <typename Int>
  57 PHILOX_DEVICE_INLINE Int SignedAdd(Int a, typename std::make_unsigned<Int>::type b)
  58 {
  59   // Implementation note: both b_div_2 and b - b_div_2 are positive and
  60   // representable as Int.
  61   auto b_div_2 = b >> 1;
  62   return a + static_cast<Int>(b_div_2) + static_cast<Int>(b - b_div_2);
  63 }
  64
  65 // A class that generates uniform distribution random numbers from the
  66 // underlying random integer generator.
  67 // Arguments:
  68 //   Generator: a generator type that returns a number of uint32 upon each
  69 //              invocation. It needs to define kResultElementCount for the
  70 //              sample count for each invocation, and ResultType for the
  71 //              actual returned sample type.
  72 //   RealType: the data type of the real numbers that will be returned by the
  73 //             distribution. This could be either float or double for now.
  74 // This class is meant to be implemented through specialization. The default
  75 // is not defined by design.
  76 template <class Generator, typename RealType> class UniformDistribution;
  77
  78 template <class Generator> class UniformDistribution<Generator, Eigen::half>
  79 {
  80 public:
  81   // The number of elements that will be returned.
  82   static constexpr int kResultElementCount = Generator::kResultElementCount;
  83   // Cost of generation of a single element (in cycles).
  84   static constexpr int kElementCost = 3;
  85   // Indicate that this distribution may take variable number of samples
  86   // during the runtime.
  87   static constexpr bool kVariableSamplesPerOutput = false;
  88   typedef Array<Eigen::half, kResultElementCount> ResultType;
  89   typedef Eigen::half ResultElementType;
  90
  91   PHILOX_DEVICE_INLINE
  92   ResultType operator()(Generator *gen)
  93   {
  94     typename Generator::ResultType sample = (*gen)();
  95     ResultType result;
  96     for (int i = 0; i < kResultElementCount; ++i)
  97     {
  98       result[i] = Uint16ToHalf(sample[i]); // Truncate the upper 16 bits.
  99     }
 100     return result;
 101   }
 102 };
 103
 104 template <class Generator> class UniformDistribution<Generator, float>
 105 {
 106 public:
 107   // The number of elements that will be returned.
 108   static constexpr int kResultElementCount = Generator::kResultElementCount;
 109   // Cost of generation of a single element (in cycles).
 110   static constexpr int kElementCost = 3;
 111   // Indicate that this distribution may take variable number of samples
 112   // during the runtime.
 113   static constexpr bool kVariableSamplesPerOutput = false;
 114   typedef Array<float, kResultElementCount> ResultType;
 115   typedef float ResultElementType;
 116
 117   PHILOX_DEVICE_INLINE
 118   ResultType operator()(Generator *gen)
 119   {
 120     typename Generator::ResultType sample = (*gen)();
 121     ResultType result;
 122     for (int i = 0; i < kResultElementCount; ++i)
 123     {
 124       result[i] = Uint32ToFloat(sample[i]);
 125     }
 126     return result;
 127   }
 128 };
 129
 130 template <class Generator> class UniformDistribution<Generator, double>
 131 {
 132 public:
 133   // The number of elements that will be returned.
 134   static constexpr int kResultElementCount = Generator::kResultElementCount / 2;
 135   // Cost of generation of a single element (in cycles).
 136   static constexpr int kElementCost = 3;
 137   // Indicate that this distribution may take variable number of samples
 138   // during the runtime.
 139   static constexpr bool kVariableSamplesPerOutput = false;
 140   typedef Array<double, kResultElementCount> ResultType;
 141   typedef double ResultElementType;
 142
 143   PHILOX_DEVICE_INLINE
 144   ResultType operator()(Generator *gen)
 145   {
 146     typename Generator::ResultType sample = (*gen)();
 147     ResultType result;
 148     for (int i = 0; i < kResultElementCount; ++i)
 149     {
 150       result[i] = Uint64ToDouble(sample[2 * i], sample[2 * i + 1]);
 151     }
 152     return result;
 153   }
 154 };
 155
 156 template <class Generator> class UniformDistribution<Generator, int32_t>
 157 {
 158 public:
 159   // The number of elements that will be returned.
 160   static constexpr int kResultElementCount = Generator::kResultElementCount;
 161   // Cost of generation of a single element (in cycles).
 162   static constexpr int kElementCost = 3;
 163   // Indicate that this distribution may take variable number of samples
 164   // during the runtime.
 165   static constexpr bool kVariableSamplesPerOutput = false;
 166   typedef Array<int32_t, kResultElementCount> ResultType;
 167   typedef int32_t ResultElementType;
 168
 169   // Must have lo < hi
 170   UniformDistribution(int32_t lo, int32_t hi)
 171       : lo_(lo), range_(static_cast<uint32_t>(hi) - static_cast<uint32_t>(lo))
 172   {
 173   }
 174
 175   PHILOX_DEVICE_INLINE
 176   ResultType operator()(Generator *gen)
 177   {
 178     typename Generator::ResultType sample = (*gen)();
 179     ResultType result;
 180     for (int i = 0; i < kResultElementCount; ++i)
 181     {
 182       result[i] = SignedAdd(lo_, sample[i] % range_);
 183     }
 184     return result;
 185   }
 186
 187 private:
 188   // Note that lo_ is intentionally signed while range_ is intentionally
 189   // unsigned.  This is because hi - lo can overflow signed integers if
 190   // lo < 0 < hi, but always fits in unsigned.
 191   int32_t lo_;
 192   int32_t range_;
 193 };
 194
 195 template <class Generator> class UniformDistribution<Generator, int64_t>
 196 {
 197 public:
 198   // The number of elements that will be returned.
 199   static constexpr int kResultElementCount = Generator::kResultElementCount / 2;
 200   // Cost of generation of a single element (in cycles).
 201   static constexpr int kElementCost = 3;
 202   // Indicate that this distribution may take variable number of samples
 203   // during the runtime.
 204   static constexpr bool kVariableSamplesPerOutput = false;
 205   typedef Array<int64_t, kResultElementCount> ResultType;
 206   typedef int64_t ResultElementType;
 207
 208   // Must have lo < hi
 209   UniformDistribution(int64_t lo, int64_t hi)
 210       : lo_(lo), range_(static_cast<uint64_t>(hi) - static_cast<uint64_t>(lo))
 211   {
 212   }
 213
 214   PHILOX_DEVICE_INLINE
 215   ResultType operator()(Generator *gen)
 216   {
 217     typename Generator::ResultType sample = (*gen)();
 218     ResultType result;
 219     for (int i = 0; i < kResultElementCount; ++i)
 220     {
 221       auto bits = sample[2 * i] | static_cast<uint64_t>(sample[2 * i + 1]) << 32;
 222       result[i] = SignedAdd(lo_, bits % range_);
 223     }
 224     return result;
 225   }
 226
 227 private:
 228   // Note that lo_ is intentionally signed while range_ is intentionally
 229   // unsigned.  This is because hi - lo can overflow signed integers if
 230   // lo < 0 < hi, but always fits in unsigned.
 231   int64_t lo_;
 232   uint64_t range_;
 233 };
 234
 235 // Similar to `UniformDistribution`, except that instead of generating numbers
 236 // in the range [low, high), it generates numbers covering the whole range of
 237 // the integer type.
 238 template <typename Generator, typename IntType> class UniformFullIntDistribution;
 239
 240 template <typename Generator, typename IntType> class UniformFullIntDistribution32
 241 {
 242 public:
 243   // The number of elements that will be returned.
 244   static constexpr int kResultElementCount = Generator::kResultElementCount;
 245   // Cost of generation of a single element (in cycles).
 246   static constexpr int kElementCost = 3;
 247   // Indicate that this distribution may take variable number of samples
 248   // during the runtime.
 249   static constexpr bool kVariableSamplesPerOutput = false;
 250   typedef Array<IntType, kResultElementCount> ResultType;
 251   typedef IntType ResultElementType;
 252
 253   PHILOX_DEVICE_INLINE
 254   ResultType operator()(Generator *gen)
 255   {
 256     typename Generator::ResultType sample = (*gen)();
 257     ResultType result;
 258     for (int i = 0; i < kResultElementCount; ++i)
 259     {
 260       result[i] = sample[i];
 261     }
 262     return result;
 263   }
 264 };
 265
 266 template <typename Generator, typename IntType> class UniformFullIntDistribution64
 267 {
 268 public:
 269   // The number of elements that will be returned.
 270   static constexpr int kResultElementCount = Generator::kResultElementCount / 2;
 271   // Cost of generation of a single element (in cycles).
 272   static constexpr int kElementCost = 3;
 273   // Indicate that this distribution may take variable number of samples
 274   // during the runtime.
 275   static constexpr bool kVariableSamplesPerOutput = false;
 276   typedef Array<IntType, kResultElementCount> ResultType;
 277   typedef IntType ResultElementType;
 278
 279   PHILOX_DEVICE_INLINE
 280   ResultType operator()(Generator *gen)
 281   {
 282     typename Generator::ResultType sample = (*gen)();
 283     ResultType result;
 284     for (int i = 0; i < kResultElementCount; ++i)
 285     {
 286       result[i] = sample[2 * i] | static_cast<uint64_t>(sample[2 * i + 1]) << 32;
 287     }
 288     return result;
 289   }
 290 };
 291
 292 template <typename Generator>
 293 class UniformFullIntDistribution<Generator, int32_t>
 294     : public UniformFullIntDistribution32<Generator, int32_t>
 295 {
 296 };
 297 template <typename Generator>
 298 class UniformFullIntDistribution<Generator, uint32_t>
 299     : public UniformFullIntDistribution32<Generator, uint32_t>
 300 {
 301 };
 302 template <typename Generator>
 303 class UniformFullIntDistribution<Generator, int64_t>
 304     : public UniformFullIntDistribution64<Generator, int64_t>
 305 {
 306 };
 307 template <typename Generator>
 308 class UniformFullIntDistribution<Generator, uint64_t>
 309     : public UniformFullIntDistribution64<Generator, uint64_t>
 310 {
 311 };
 312
 313 // A class that adapts the underlying native multiple samples to return a single
 314 // sample at a time.
 315 template <class Generator> class SingleSampleAdapter
 316 {
 317 public:
 318   // The number of elements that will be returned.
 319   static constexpr int kResultElementCount = 1;
 320   // The number of elements that will be returned by the underlying generator.
 321   static constexpr int kNativeElementCount = Generator::kResultElementCount;
 322   typedef typename Generator::ResultElementType ResultType;
 323   typedef typename Generator::ResultElementType ResultElementType;
 324
 325   PHILOX_DEVICE_INLINE
 326   explicit SingleSampleAdapter(Generator *gen)
 327       : generator_(gen), used_result_index_(Generator::kResultElementCount)
 328   {
 329   }
 330
 331   PHILOX_DEVICE_INLINE
 332   ResultType operator()()
 333   {
 334     if (used_result_index_ == Generator::kResultElementCount)
 335     {
 336       unused_results_ = (*generator_)();
 337       used_result_index_ = 0;
 338     }
 339
 340     return unused_results_[used_result_index_++];
 341   }
 342
 343   PHILOX_DEVICE_INLINE
 344   void Skip(uint64_t num_skips)
 345   {
 346     if (!num_skips)
 347     {
 348       return;
 349     }
 350     int num_unused_results = kNativeElementCount - used_result_index_;
 351     if (num_skips <= num_unused_results)
 352     {
 353       used_result_index_ += num_skips;
 354       return;
 355     }
 356     num_skips -= num_unused_results;
 357     used_result_index_ = kNativeElementCount;
 358     SkipFromGenerator(num_skips / kNativeElementCount);
 359     num_skips = num_skips % kNativeElementCount;
 360     if (num_skips)
 361     {
 362       unused_results_ = (*generator_)();
 363       used_result_index_ = num_skips;
 364     }
 365   }
 366
 367 private:
 368   // This implementation iteratively skips over `num_skips` samples
 369   // from `generator_`. There is an O(1) implementation for PhiloxRandom
 370   // in random_distributions.cc.
 371   PHILOX_DEVICE_INLINE
 372   void SkipFromGenerator(uint64_t num_skips)
 373   {
 374     while (num_skips--)
 375     {
 376       (*generator_)();
 377     }
 378   }
 379
 380   Generator *generator_;
 381   typename Generator::ResultType unused_results_;
 382   int used_result_index_;
 383 };
 384
 385 // A class that generates unit normal distribution random numbers from the
 386 // underlying random integer generator.
 387 // Arguments:
 388 //   Generator: a generator type that returns a number of uint32 upon each
 389 //              each invocation. It needs to define kResultElementCount for the
 390 //              sample count for each invocation, and ResultType for actual
 391 //              returned sample type.
 392 //   RealType: the data type of the real numbers that will be returned by the
 393 //             distribution. This could be either float or double for now.
 394 // This class is meant to be implemented through specialization. The default
 395 // is not defined by design.
 396 template <class Generator, typename RealType> class NormalDistribution;
 397
 398 PHILOX_DEVICE_INLINE
 399 void BoxMullerFloat(uint32_t x0, uint32_t x1, float *f0, float *f1);
 400
 401 PHILOX_DEVICE_INLINE
 402 void BoxMullerDouble(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, double *d0, double *d1);
 403
 404 // Exactly like the float version, except that we convert to half afterwards;
 405 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
 406 // gain from working in half internally.
 407 template <class Generator> class NormalDistribution<Generator, Eigen::half>
 408 {
 409 public:
 410   // The number of elements that will be returned.
 411   static constexpr int kResultElementCount = Generator::kResultElementCount;
 412   // Cost of generation of a single element (in cycles).
 413   static constexpr int kElementCost = 70;
 414   // Indicate that this distribution may take variable number of samples
 415   // during the runtime.
 416   static constexpr bool kVariableSamplesPerOutput = false;
 417   typedef Array<Eigen::half, kResultElementCount> ResultType;
 418   typedef Eigen::half ResultElementType;
 419
 420   PHILOX_DEVICE_INLINE
 421   ResultType operator()(Generator *gen)
 422   {
 423     typename Generator::ResultType sample = (*gen)();
 424     ResultType result;
 425     for (int i = 0; i < kResultElementCount; i += 2)
 426     {
 427       float f[2];
 428       BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
 429       result[i] = Eigen::half(f[0]);
 430       result[i + 1] = Eigen::half(f[1]);
 431     }
 432     return result;
 433   }
 434 };
 435
 436 template <class Generator> class NormalDistribution<Generator, float>
 437 {
 438 public:
 439   // The number of elements that will be returned.
 440   static constexpr int kResultElementCount = Generator::kResultElementCount;
 441   // Cost of generation of a single element (in cycles).
 442   static constexpr int kElementCost = 70;
 443   // Indicate that this distribution may take variable number of samples
 444   // during the runtime.
 445   static constexpr bool kVariableSamplesPerOutput = false;
 446   typedef Array<float, kResultElementCount> ResultType;
 447   typedef float ResultElementType;
 448
 449   PHILOX_DEVICE_INLINE
 450   ResultType operator()(Generator *gen)
 451   {
 452     typename Generator::ResultType sample = (*gen)();
 453     ResultType result;
 454     for (int i = 0; i < kResultElementCount; i += 2)
 455     {
 456       BoxMullerFloat(sample[i], sample[i + 1], &result[i], &result[i + 1]);
 457     }
 458     return result;
 459   }
 460 };
 461
 462 template <class Generator> class NormalDistribution<Generator, double>
 463 {
 464 public:
 465   // The number of elements that will be returned.
 466   static constexpr int kResultElementCount = Generator::kResultElementCount / 2;
 467   // Cost of generation of a single element (in cycles).
 468   static constexpr int kElementCost = 70;
 469   // Indicate that this distribution may take variable number of samples
 470   // during the runtime.
 471   static constexpr bool kVariableSamplesPerOutput = false;
 472   typedef Array<double, kResultElementCount> ResultType;
 473   typedef double ResultElementType;
 474
 475   PHILOX_DEVICE_INLINE
 476   ResultType operator()(Generator *gen)
 477   {
 478     typename Generator::ResultType sample = (*gen)();
 479     ResultType result;
 480     for (int i = 0; i < kResultElementCount; i += 2)
 481     {
 482       const int i2 = 2 * i;
 483       BoxMullerDouble(sample[i2], sample[i2 + 1], sample[i2 + 2], sample[i2 + 3], &result[i],
 484                       &result[i + 1]);
 485     }
 486     return result;
 487   }
 488 };
 489
 490 // A class that returns standard normal distribution between
 491 // [-kTruncateValue, kTruncateValue].
 492 // Arguments:
 493 //   Generator: a generator type that returns a number of uint32 upon each
 494 //              each invocation. It needs to define kResultElementCount for the
 495 //              sample count for each invocation, and ResultType for actual
 496 //              returned sample type.
 497 //   RealType: the data type of the real numbers that will be returned by the
 498 //             distribution. This could be either float or double for now.
 499 // This class is meant to be implemented through specialization. The default
 500 // is not defined by design.
 501 template <class SingleSampleGenerator, typename RealType> class TruncatedNormalDistribution;
 502
 503 // Exactly like the float version, except that we convert to half afterwards;
 504 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
 505 // gain from working in half internally.
 506 template <class SingleSampleGenerator>
 507 class TruncatedNormalDistribution<SingleSampleGenerator, Eigen::half>
 508 {
 509 public:
 510   // The number of elements that will be returned.
 511   static constexpr int kResultElementCount = SingleSampleGenerator::kNativeElementCount;
 512   // Cost of generation of a single element (in cycles).
 513   static constexpr int kElementCost = 90;
 514   // Indicate that this distribution may take variable number of samples
 515   // during the runtime.
 516   static constexpr bool kVariableSamplesPerOutput = true;
 517   // The threshold where the normal distribution is truncated.
 518   const float kTruncateValue = 2.0f;
 519
 520   typedef Array<Eigen::half, kResultElementCount> ResultType;
 521   typedef Eigen::half ResultElementType;
 522
 523   PHILOX_DEVICE_INLINE
 524   ResultType operator()(SingleSampleGenerator *gen)
 525   {
 526     ResultType results;
 527     int index = 0;
 528     while (true)
 529     {
 530       // Repeatedly take samples from the normal distribution, until we have
 531       // the desired number of elements that fall within the pre-defined cutoff
 532       // threshold.
 533       const uint32_t x0 = (*gen)();
 534       const uint32_t x1 = (*gen)();
 535       float f[2];
 536       BoxMullerFloat(x0, x1, &f[0], &f[1]);
 537
 538       if (Eigen::numext::abs(f[0]) < kTruncateValue)
 539       {
 540         results[index++] = Eigen::half(f[0]);
 541         if (index >= kResultElementCount)
 542         {
 543           return results;
 544         }
 545       }
 546       if (Eigen::numext::abs(f[1]) < kTruncateValue)
 547       {
 548         results[index++] = Eigen::half(f[1]);
 549         if (index >= kResultElementCount)
 550         {
 551           return results;
 552         }
 553       }
 554     }
 555   }
 556 };
 557
 558 // Partial specialization for float.
 559 template <class SingleSampleGenerator>
 560 class TruncatedNormalDistribution<SingleSampleGenerator, float>
 561 {
 562 public:
 563   // The number of elements that will be returned.
 564   static constexpr int kResultElementCount = SingleSampleGenerator::kNativeElementCount;
 565   // Cost of generation of a single element (in cycles).
 566   static constexpr int kElementCost = 90;
 567   // Indicate that this distribution may take variable number of samples
 568   // during the runtime.
 569   static constexpr bool kVariableSamplesPerOutput = true;
 570   // The threshold where the normal distribution is truncated.
 571   const float kTruncateValue = 2.0f;
 572
 573   typedef Array<float, kResultElementCount> ResultType;
 574   typedef float ResultElementType;
 575
 576   PHILOX_DEVICE_INLINE
 577   ResultType operator()(SingleSampleGenerator *gen)
 578   {
 579     ResultType results;
 580     int index = 0;
 581     while (true)
 582     {
 583       // Repeatedly take samples from the normal distribution, until we have
 584       // the desired number of elements that fall within the pre-defined cutoff
 585       // threshold.
 586       const uint32_t x0 = (*gen)();
 587       const uint32_t x1 = (*gen)();
 588       float f[2];
 589       BoxMullerFloat(x0, x1, &f[0], &f[1]);
 590
 591       if (Eigen::numext::abs(f[0]) < kTruncateValue)
 592       {
 593         results[index++] = f[0];
 594         if (index >= kResultElementCount)
 595         {
 596           return results;
 597         }
 598       }
 599       if (Eigen::numext::abs(f[1]) < kTruncateValue)
 600       {
 601         results[index++] = f[1];
 602         if (index >= kResultElementCount)
 603         {
 604           return results;
 605         }
 606       }
 607     }
 608   }
 609 };
 610
 611 // Partial specialization for double.
 612 template <class SingleSampleGenerator>
 613 class TruncatedNormalDistribution<SingleSampleGenerator, double>
 614 {
 615 public:
 616   // The number of elements that will be returned.
 617   static constexpr int kResultElementCount = (SingleSampleGenerator::kNativeElementCount > 1)
 618                                                  ? SingleSampleGenerator::kNativeElementCount / 2
 619                                                  : 1;
 620   // Cost of generation of a single element (in cycles).
 621   static constexpr int kElementCost = 90;
 622   // Indicate that this distribution may take variable number of samples
 623   // during the runtime.
 624   static constexpr bool kVariableSamplesPerOutput = true;
 625   typedef Array<double, kResultElementCount> ResultType;
 626   typedef double ResultElementType;
 627   const double kTruncateValue = 2.0;
 628
 629   PHILOX_DEVICE_INLINE
 630   ResultType operator()(SingleSampleGenerator *gen)
 631   {
 632     ResultType results;
 633     int index = 0;
 634     while (1)
 635     {
 636       const uint32_t x0 = (*gen)();
 637       const uint32_t x1 = (*gen)();
 638       const uint32_t x2 = (*gen)();
 639       const uint32_t x3 = (*gen)();
 640       double d[2];
 641       BoxMullerDouble(x0, x1, x2, x3, &d[0], &d[1]);
 642
 643       if (Eigen::numext::abs(d[0]) < kTruncateValue)
 644       {
 645         results[index++] = d[0];
 646         if (index >= kResultElementCount)
 647         {
 648           return results;
 649         }
 650       }
 651       if (Eigen::numext::abs(d[1]) < kTruncateValue)
 652       {
 653         results[index++] = d[1];
 654         if (index >= kResultElementCount)
 655         {
 656           return results;
 657         }
 658       }
 659     }
 660   }
 661 };
 662
 663 // Helper function to convert two 32-bit uniform integers to two floats
 664 // under the unit normal distribution.
 665 PHILOX_DEVICE_INLINE
 666 void BoxMullerFloat(uint32_t x0, uint32_t x1, float *f0, float *f1)
 667 {
 668   // This function implements the Box-Muller transform:
 669   // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
 670   // Do not send a really small number to log().
 671   // We cannot mark "epsilon" as "static const" because NVCC would complain
 672   const float epsilon = 1.0e-7f;
 673   float u1 = Uint32ToFloat(x0);
 674   if (u1 < epsilon)
 675   {
 676     u1 = epsilon;
 677   }
 678   const float v1 = 2.0f * M_PI * Uint32ToFloat(x1);
 679   const float u2 = Eigen::numext::sqrt(-2.0f * Eigen::numext::log(u1));
 680 #if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
 681   *f0 = Eigen::numext::sin(v1);
 682   *f1 = Eigen::numext::cos(v1);
 683 #else
 684   sincosf(v1, f0, f1);
 685 #endif
 686   *f0 *= u2;
 687   *f1 *= u2;
 688 }
 689
 690 // Helper function to convert four 32-bit uniform integers to two doubles
 691 // under the unit normal distribution.
 692 PHILOX_DEVICE_INLINE
 693 void BoxMullerDouble(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, double *d0, double *d1)
 694 {
 695   // This function implements the Box-Muller transform:
 696   // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
 697   // Do not send a really small number to log().
 698   // We cannot mark "epsilon" as "static const" because NVCC would complain
 699   const double epsilon = 1.0e-7;
 700   double u1 = Uint64ToDouble(x0, x1);
 701   if (u1 < epsilon)
 702   {
 703     u1 = epsilon;
 704   }
 705   const double v1 = 2 * M_PI * Uint64ToDouble(x2, x3);
 706   const double u2 = Eigen::numext::sqrt(-2.0 * Eigen::numext::log(u1));
 707 #if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
 708   *d0 = Eigen::numext::sin(v1);
 709   *d1 = Eigen::numext::cos(v1);
 710 #else
 711   sincos(v1, d0, d1);
 712 #endif
 713   *d0 *= u2;
 714   *d1 *= u2;
 715 }
 716
 717 // Helper function to convert an 16-bit integer to a half between [0..1).
 718 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16_t x)
 719 {
 720   // IEEE754 halfs are formatted as follows (MSB first):
 721   //    sign(1) exponent(5) mantissa(10)
 722   // Conceptually construct the following:
 723   //    sign == 0
 724   //    exponent == 15  -- an excess 15 representation of a zero exponent
 725   //    mantissa == 10 random bits
 726   const uint16_t man = x & 0x3ffu; // 10 bit mantissa
 727   const uint16_t exp = static_cast<uint16_t>(15);
 728   const uint16_t val = (exp << 10) | man;
 729
 730   Eigen::half result;
 731   result.x = val;
 732   return result - Eigen::half(1.0);
 733 }
 734
 735 // Helper function to convert an 32-bit integer to a float between [0..1).
 736 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32_t x)
 737 {
 738   // IEEE754 floats are formatted as follows (MSB first):
 739   //    sign(1) exponent(8) mantissa(23)
 740   // Conceptually construct the following:
 741   //    sign == 0
 742   //    exponent == 127  -- an excess 127 representation of a zero exponent
 743   //    mantissa == 23 random bits
 744   const uint32_t man = x & 0x7fffffu; // 23 bit mantissa
 745   const uint32_t exp = static_cast<uint32_t>(127);
 746   const uint32_t val = (exp << 23) | man;
 747
 748   // Assumes that endian-ness is same for float and uint32.
 749   float result;
 750   memcpy(&result, &val, sizeof(val));
 751   return result - 1.0f;
 752 }
 753
 754 // Helper function to convert two 32-bit integers to a double between [0..1).
 755 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32_t x0, uint32_t x1)
 756 {
 757   // IEEE754 doubles are formatted as follows (MSB first):
 758   //    sign(1) exponent(11) mantissa(52)
 759   // Conceptually construct the following:
 760   //    sign == 0
 761   //    exponent == 1023  -- an excess 1023 representation of a zero exponent
 762   //    mantissa == 52 random bits
 763   const uint32_t mhi = x0 & 0xfffffu;                            // upper 20 bits of mantissa
 764   const uint32_t mlo = x1;                                       // lower 32 bits of mantissa
 765   const uint64_t man = (static_cast<uint64_t>(mhi) << 32) | mlo; // mantissa
 766   const uint64_t exp = static_cast<uint64_t>(1023);
 767   const uint64_t val = (exp << 52) | man;
 768   // Assumes that endian-ness is same for double and uint64.
 769   double result;
 770   memcpy(&result, &val, sizeof(val));
 771   return result - 1.0;
 772 }
 773
 774 } // namespace random
 775 } // namespace tensorflow
 776 }
 777
 778 #endif // __NNFW_CKER_HELPER_RANDOM_DISTRIBUTIONS_H__