2 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
3 * Copyright 2015 The TensorFlow Authors. All Rights Reserved.
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
18 #ifndef __NNFW_CKER_HELPER_RANDOM_DISTRIBUTIONS_H__
19 #define __NNFW_CKER_HELPER_RANDOM_DISTRIBUTIONS_H__
26 #include <type_traits>
28 #include "cker/Types.h"
29 #include "cker/Shape.h"
30 #include "cker/Utils.h"
32 #include "cker/eigen/EigenSupport.h"
33 #include "cker/operation/Helper/PhiloxRandom.h"
42 // Helper function to convert a 16-bit integer to a half between [0..1).
43 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16_t x);
44 // Helper function to convert a 16-bit integer to a bfloat16 between [0..1).
45 // PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x);
46 // Helper function to convert a 32-bit integer to a float between [0..1).
47 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32_t x);
48 // Helper function to convert two 32-bit integers to a double between [0..1).
49 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32_t x0, uint32_t x1);
51 // Computes a + b. Requires that the result is representable in the destination
52 // type and that b is not maximal (i.e. b + 1 is not 0). Notably, the addend b
53 // need *not* be representable in that type. (The condition on b excludes the
54 // extremal case INT_MIN + UINT_MAX = INT_MAX, which this function cannot
56 template <typename Int>
57 PHILOX_DEVICE_INLINE Int SignedAdd(Int a, typename std::make_unsigned<Int>::type b)
59 // Implementation note: both b_div_2 and b - b_div_2 are positive and
60 // representable as Int.
61 auto b_div_2 = b >> 1;
62 return a + static_cast<Int>(b_div_2) + static_cast<Int>(b - b_div_2);
65 // A class that generates uniform distribution random numbers from the
66 // underlying random integer generator.
68 // Generator: a generator type that returns a number of uint32 upon each
69 // invocation. It needs to define kResultElementCount for the
70 // sample count for each invocation, and ResultType for the
71 // actual returned sample type.
72 // RealType: the data type of the real numbers that will be returned by the
73 // distribution. This could be either float or double for now.
74 // This class is meant to be implemented through specialization. The default
75 // is not defined by design.
76 template <class Generator, typename RealType> class UniformDistribution;
78 template <class Generator> class UniformDistribution<Generator, Eigen::half>
81 // The number of elements that will be returned.
82 static constexpr int kResultElementCount = Generator::kResultElementCount;
83 // Cost of generation of a single element (in cycles).
84 static constexpr int kElementCost = 3;
85 // Indicate that this distribution may take variable number of samples
86 // during the runtime.
87 static constexpr bool kVariableSamplesPerOutput = false;
88 typedef Array<Eigen::half, kResultElementCount> ResultType;
89 typedef Eigen::half ResultElementType;
92 ResultType operator()(Generator *gen)
94 typename Generator::ResultType sample = (*gen)();
96 for (int i = 0; i < kResultElementCount; ++i)
98 result[i] = Uint16ToHalf(sample[i]); // Truncate the upper 16 bits.
104 template <class Generator> class UniformDistribution<Generator, float>
107 // The number of elements that will be returned.
108 static constexpr int kResultElementCount = Generator::kResultElementCount;
109 // Cost of generation of a single element (in cycles).
110 static constexpr int kElementCost = 3;
111 // Indicate that this distribution may take variable number of samples
112 // during the runtime.
113 static constexpr bool kVariableSamplesPerOutput = false;
114 typedef Array<float, kResultElementCount> ResultType;
115 typedef float ResultElementType;
118 ResultType operator()(Generator *gen)
120 typename Generator::ResultType sample = (*gen)();
122 for (int i = 0; i < kResultElementCount; ++i)
124 result[i] = Uint32ToFloat(sample[i]);
130 template <class Generator> class UniformDistribution<Generator, double>
133 // The number of elements that will be returned.
134 static constexpr int kResultElementCount = Generator::kResultElementCount / 2;
135 // Cost of generation of a single element (in cycles).
136 static constexpr int kElementCost = 3;
137 // Indicate that this distribution may take variable number of samples
138 // during the runtime.
139 static constexpr bool kVariableSamplesPerOutput = false;
140 typedef Array<double, kResultElementCount> ResultType;
141 typedef double ResultElementType;
144 ResultType operator()(Generator *gen)
146 typename Generator::ResultType sample = (*gen)();
148 for (int i = 0; i < kResultElementCount; ++i)
150 result[i] = Uint64ToDouble(sample[2 * i], sample[2 * i + 1]);
156 template <class Generator> class UniformDistribution<Generator, int32_t>
159 // The number of elements that will be returned.
160 static constexpr int kResultElementCount = Generator::kResultElementCount;
161 // Cost of generation of a single element (in cycles).
162 static constexpr int kElementCost = 3;
163 // Indicate that this distribution may take variable number of samples
164 // during the runtime.
165 static constexpr bool kVariableSamplesPerOutput = false;
166 typedef Array<int32_t, kResultElementCount> ResultType;
167 typedef int32_t ResultElementType;
170 UniformDistribution(int32_t lo, int32_t hi)
171 : lo_(lo), range_(static_cast<uint32_t>(hi) - static_cast<uint32_t>(lo))
176 ResultType operator()(Generator *gen)
178 typename Generator::ResultType sample = (*gen)();
180 for (int i = 0; i < kResultElementCount; ++i)
182 result[i] = SignedAdd(lo_, sample[i] % range_);
188 // Note that lo_ is intentionally signed while range_ is intentionally
189 // unsigned. This is because hi - lo can overflow signed integers if
190 // lo < 0 < hi, but always fits in unsigned.
195 template <class Generator> class UniformDistribution<Generator, int64_t>
198 // The number of elements that will be returned.
199 static constexpr int kResultElementCount = Generator::kResultElementCount / 2;
200 // Cost of generation of a single element (in cycles).
201 static constexpr int kElementCost = 3;
202 // Indicate that this distribution may take variable number of samples
203 // during the runtime.
204 static constexpr bool kVariableSamplesPerOutput = false;
205 typedef Array<int64_t, kResultElementCount> ResultType;
206 typedef int64_t ResultElementType;
209 UniformDistribution(int64_t lo, int64_t hi)
210 : lo_(lo), range_(static_cast<uint64_t>(hi) - static_cast<uint64_t>(lo))
215 ResultType operator()(Generator *gen)
217 typename Generator::ResultType sample = (*gen)();
219 for (int i = 0; i < kResultElementCount; ++i)
221 auto bits = sample[2 * i] | static_cast<uint64_t>(sample[2 * i + 1]) << 32;
222 result[i] = SignedAdd(lo_, bits % range_);
228 // Note that lo_ is intentionally signed while range_ is intentionally
229 // unsigned. This is because hi - lo can overflow signed integers if
230 // lo < 0 < hi, but always fits in unsigned.
235 // Similar to `UniformDistribution`, except that instead of generating numbers
236 // in the range [low, high), it generates numbers covering the whole range of
238 template <typename Generator, typename IntType> class UniformFullIntDistribution;
240 template <typename Generator, typename IntType> class UniformFullIntDistribution32
243 // The number of elements that will be returned.
244 static constexpr int kResultElementCount = Generator::kResultElementCount;
245 // Cost of generation of a single element (in cycles).
246 static constexpr int kElementCost = 3;
247 // Indicate that this distribution may take variable number of samples
248 // during the runtime.
249 static constexpr bool kVariableSamplesPerOutput = false;
250 typedef Array<IntType, kResultElementCount> ResultType;
251 typedef IntType ResultElementType;
254 ResultType operator()(Generator *gen)
256 typename Generator::ResultType sample = (*gen)();
258 for (int i = 0; i < kResultElementCount; ++i)
260 result[i] = sample[i];
266 template <typename Generator, typename IntType> class UniformFullIntDistribution64
269 // The number of elements that will be returned.
270 static constexpr int kResultElementCount = Generator::kResultElementCount / 2;
271 // Cost of generation of a single element (in cycles).
272 static constexpr int kElementCost = 3;
273 // Indicate that this distribution may take variable number of samples
274 // during the runtime.
275 static constexpr bool kVariableSamplesPerOutput = false;
276 typedef Array<IntType, kResultElementCount> ResultType;
277 typedef IntType ResultElementType;
280 ResultType operator()(Generator *gen)
282 typename Generator::ResultType sample = (*gen)();
284 for (int i = 0; i < kResultElementCount; ++i)
286 result[i] = sample[2 * i] | static_cast<uint64_t>(sample[2 * i + 1]) << 32;
292 template <typename Generator>
293 class UniformFullIntDistribution<Generator, int32_t>
294 : public UniformFullIntDistribution32<Generator, int32_t>
297 template <typename Generator>
298 class UniformFullIntDistribution<Generator, uint32_t>
299 : public UniformFullIntDistribution32<Generator, uint32_t>
302 template <typename Generator>
303 class UniformFullIntDistribution<Generator, int64_t>
304 : public UniformFullIntDistribution64<Generator, int64_t>
307 template <typename Generator>
308 class UniformFullIntDistribution<Generator, uint64_t>
309 : public UniformFullIntDistribution64<Generator, uint64_t>
313 // A class that adapts the underlying native multiple samples to return a single
315 template <class Generator> class SingleSampleAdapter
318 // The number of elements that will be returned.
319 static constexpr int kResultElementCount = 1;
320 // The number of elements that will be returned by the underlying generator.
321 static constexpr int kNativeElementCount = Generator::kResultElementCount;
322 typedef typename Generator::ResultElementType ResultType;
323 typedef typename Generator::ResultElementType ResultElementType;
326 explicit SingleSampleAdapter(Generator *gen)
327 : generator_(gen), used_result_index_(Generator::kResultElementCount)
332 ResultType operator()()
334 if (used_result_index_ == Generator::kResultElementCount)
336 unused_results_ = (*generator_)();
337 used_result_index_ = 0;
340 return unused_results_[used_result_index_++];
344 void Skip(uint64_t num_skips)
350 int num_unused_results = kNativeElementCount - used_result_index_;
351 if (num_skips <= num_unused_results)
353 used_result_index_ += num_skips;
356 num_skips -= num_unused_results;
357 used_result_index_ = kNativeElementCount;
358 SkipFromGenerator(num_skips / kNativeElementCount);
359 num_skips = num_skips % kNativeElementCount;
362 unused_results_ = (*generator_)();
363 used_result_index_ = num_skips;
368 // This implementation iteratively skips over `num_skips` samples
369 // from `generator_`. There is an O(1) implementation for PhiloxRandom
370 // in random_distributions.cc.
372 void SkipFromGenerator(uint64_t num_skips)
380 Generator *generator_;
381 typename Generator::ResultType unused_results_;
382 int used_result_index_;
385 // A class that generates unit normal distribution random numbers from the
386 // underlying random integer generator.
388 // Generator: a generator type that returns a number of uint32 upon each
389 // each invocation. It needs to define kResultElementCount for the
390 // sample count for each invocation, and ResultType for actual
391 // returned sample type.
392 // RealType: the data type of the real numbers that will be returned by the
393 // distribution. This could be either float or double for now.
394 // This class is meant to be implemented through specialization. The default
395 // is not defined by design.
396 template <class Generator, typename RealType> class NormalDistribution;
399 void BoxMullerFloat(uint32_t x0, uint32_t x1, float *f0, float *f1);
402 void BoxMullerDouble(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, double *d0, double *d1);
404 // Exactly like the float version, except that we convert to half afterwards;
405 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
406 // gain from working in half internally.
407 template <class Generator> class NormalDistribution<Generator, Eigen::half>
410 // The number of elements that will be returned.
411 static constexpr int kResultElementCount = Generator::kResultElementCount;
412 // Cost of generation of a single element (in cycles).
413 static constexpr int kElementCost = 70;
414 // Indicate that this distribution may take variable number of samples
415 // during the runtime.
416 static constexpr bool kVariableSamplesPerOutput = false;
417 typedef Array<Eigen::half, kResultElementCount> ResultType;
418 typedef Eigen::half ResultElementType;
421 ResultType operator()(Generator *gen)
423 typename Generator::ResultType sample = (*gen)();
425 for (int i = 0; i < kResultElementCount; i += 2)
428 BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
429 result[i] = Eigen::half(f[0]);
430 result[i + 1] = Eigen::half(f[1]);
436 template <class Generator> class NormalDistribution<Generator, float>
439 // The number of elements that will be returned.
440 static constexpr int kResultElementCount = Generator::kResultElementCount;
441 // Cost of generation of a single element (in cycles).
442 static constexpr int kElementCost = 70;
443 // Indicate that this distribution may take variable number of samples
444 // during the runtime.
445 static constexpr bool kVariableSamplesPerOutput = false;
446 typedef Array<float, kResultElementCount> ResultType;
447 typedef float ResultElementType;
450 ResultType operator()(Generator *gen)
452 typename Generator::ResultType sample = (*gen)();
454 for (int i = 0; i < kResultElementCount; i += 2)
456 BoxMullerFloat(sample[i], sample[i + 1], &result[i], &result[i + 1]);
462 template <class Generator> class NormalDistribution<Generator, double>
465 // The number of elements that will be returned.
466 static constexpr int kResultElementCount = Generator::kResultElementCount / 2;
467 // Cost of generation of a single element (in cycles).
468 static constexpr int kElementCost = 70;
469 // Indicate that this distribution may take variable number of samples
470 // during the runtime.
471 static constexpr bool kVariableSamplesPerOutput = false;
472 typedef Array<double, kResultElementCount> ResultType;
473 typedef double ResultElementType;
476 ResultType operator()(Generator *gen)
478 typename Generator::ResultType sample = (*gen)();
480 for (int i = 0; i < kResultElementCount; i += 2)
482 const int i2 = 2 * i;
483 BoxMullerDouble(sample[i2], sample[i2 + 1], sample[i2 + 2], sample[i2 + 3], &result[i],
490 // A class that returns standard normal distribution between
491 // [-kTruncateValue, kTruncateValue].
493 // Generator: a generator type that returns a number of uint32 upon each
494 // each invocation. It needs to define kResultElementCount for the
495 // sample count for each invocation, and ResultType for actual
496 // returned sample type.
497 // RealType: the data type of the real numbers that will be returned by the
498 // distribution. This could be either float or double for now.
499 // This class is meant to be implemented through specialization. The default
500 // is not defined by design.
501 template <class SingleSampleGenerator, typename RealType> class TruncatedNormalDistribution;
503 // Exactly like the float version, except that we convert to half afterwards;
504 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
505 // gain from working in half internally.
506 template <class SingleSampleGenerator>
507 class TruncatedNormalDistribution<SingleSampleGenerator, Eigen::half>
510 // The number of elements that will be returned.
511 static constexpr int kResultElementCount = SingleSampleGenerator::kNativeElementCount;
512 // Cost of generation of a single element (in cycles).
513 static constexpr int kElementCost = 90;
514 // Indicate that this distribution may take variable number of samples
515 // during the runtime.
516 static constexpr bool kVariableSamplesPerOutput = true;
517 // The threshold where the normal distribution is truncated.
518 const float kTruncateValue = 2.0f;
520 typedef Array<Eigen::half, kResultElementCount> ResultType;
521 typedef Eigen::half ResultElementType;
524 ResultType operator()(SingleSampleGenerator *gen)
530 // Repeatedly take samples from the normal distribution, until we have
531 // the desired number of elements that fall within the pre-defined cutoff
533 const uint32_t x0 = (*gen)();
534 const uint32_t x1 = (*gen)();
536 BoxMullerFloat(x0, x1, &f[0], &f[1]);
538 if (Eigen::numext::abs(f[0]) < kTruncateValue)
540 results[index++] = Eigen::half(f[0]);
541 if (index >= kResultElementCount)
546 if (Eigen::numext::abs(f[1]) < kTruncateValue)
548 results[index++] = Eigen::half(f[1]);
549 if (index >= kResultElementCount)
558 // Partial specialization for float.
559 template <class SingleSampleGenerator>
560 class TruncatedNormalDistribution<SingleSampleGenerator, float>
563 // The number of elements that will be returned.
564 static constexpr int kResultElementCount = SingleSampleGenerator::kNativeElementCount;
565 // Cost of generation of a single element (in cycles).
566 static constexpr int kElementCost = 90;
567 // Indicate that this distribution may take variable number of samples
568 // during the runtime.
569 static constexpr bool kVariableSamplesPerOutput = true;
570 // The threshold where the normal distribution is truncated.
571 const float kTruncateValue = 2.0f;
573 typedef Array<float, kResultElementCount> ResultType;
574 typedef float ResultElementType;
577 ResultType operator()(SingleSampleGenerator *gen)
583 // Repeatedly take samples from the normal distribution, until we have
584 // the desired number of elements that fall within the pre-defined cutoff
586 const uint32_t x0 = (*gen)();
587 const uint32_t x1 = (*gen)();
589 BoxMullerFloat(x0, x1, &f[0], &f[1]);
591 if (Eigen::numext::abs(f[0]) < kTruncateValue)
593 results[index++] = f[0];
594 if (index >= kResultElementCount)
599 if (Eigen::numext::abs(f[1]) < kTruncateValue)
601 results[index++] = f[1];
602 if (index >= kResultElementCount)
611 // Partial specialization for double.
612 template <class SingleSampleGenerator>
613 class TruncatedNormalDistribution<SingleSampleGenerator, double>
616 // The number of elements that will be returned.
617 static constexpr int kResultElementCount = (SingleSampleGenerator::kNativeElementCount > 1)
618 ? SingleSampleGenerator::kNativeElementCount / 2
620 // Cost of generation of a single element (in cycles).
621 static constexpr int kElementCost = 90;
622 // Indicate that this distribution may take variable number of samples
623 // during the runtime.
624 static constexpr bool kVariableSamplesPerOutput = true;
625 typedef Array<double, kResultElementCount> ResultType;
626 typedef double ResultElementType;
627 const double kTruncateValue = 2.0;
630 ResultType operator()(SingleSampleGenerator *gen)
636 const uint32_t x0 = (*gen)();
637 const uint32_t x1 = (*gen)();
638 const uint32_t x2 = (*gen)();
639 const uint32_t x3 = (*gen)();
641 BoxMullerDouble(x0, x1, x2, x3, &d[0], &d[1]);
643 if (Eigen::numext::abs(d[0]) < kTruncateValue)
645 results[index++] = d[0];
646 if (index >= kResultElementCount)
651 if (Eigen::numext::abs(d[1]) < kTruncateValue)
653 results[index++] = d[1];
654 if (index >= kResultElementCount)
663 // Helper function to convert two 32-bit uniform integers to two floats
664 // under the unit normal distribution.
666 void BoxMullerFloat(uint32_t x0, uint32_t x1, float *f0, float *f1)
668 // This function implements the Box-Muller transform:
669 // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
670 // Do not send a really small number to log().
671 // We cannot mark "epsilon" as "static const" because NVCC would complain
672 const float epsilon = 1.0e-7f;
673 float u1 = Uint32ToFloat(x0);
678 const float v1 = 2.0f * M_PI * Uint32ToFloat(x1);
679 const float u2 = Eigen::numext::sqrt(-2.0f * Eigen::numext::log(u1));
680 #if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
681 *f0 = Eigen::numext::sin(v1);
682 *f1 = Eigen::numext::cos(v1);
690 // Helper function to convert four 32-bit uniform integers to two doubles
691 // under the unit normal distribution.
693 void BoxMullerDouble(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, double *d0, double *d1)
695 // This function implements the Box-Muller transform:
696 // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
697 // Do not send a really small number to log().
698 // We cannot mark "epsilon" as "static const" because NVCC would complain
699 const double epsilon = 1.0e-7;
700 double u1 = Uint64ToDouble(x0, x1);
705 const double v1 = 2 * M_PI * Uint64ToDouble(x2, x3);
706 const double u2 = Eigen::numext::sqrt(-2.0 * Eigen::numext::log(u1));
707 #if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
708 *d0 = Eigen::numext::sin(v1);
709 *d1 = Eigen::numext::cos(v1);
717 // Helper function to convert an 16-bit integer to a half between [0..1).
718 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16_t x)
720 // IEEE754 halfs are formatted as follows (MSB first):
721 // sign(1) exponent(5) mantissa(10)
722 // Conceptually construct the following:
724 // exponent == 15 -- an excess 15 representation of a zero exponent
725 // mantissa == 10 random bits
726 const uint16_t man = x & 0x3ffu; // 10 bit mantissa
727 const uint16_t exp = static_cast<uint16_t>(15);
728 const uint16_t val = (exp << 10) | man;
732 return result - Eigen::half(1.0);
735 // Helper function to convert an 32-bit integer to a float between [0..1).
736 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32_t x)
738 // IEEE754 floats are formatted as follows (MSB first):
739 // sign(1) exponent(8) mantissa(23)
740 // Conceptually construct the following:
742 // exponent == 127 -- an excess 127 representation of a zero exponent
743 // mantissa == 23 random bits
744 const uint32_t man = x & 0x7fffffu; // 23 bit mantissa
745 const uint32_t exp = static_cast<uint32_t>(127);
746 const uint32_t val = (exp << 23) | man;
748 // Assumes that endian-ness is same for float and uint32.
750 memcpy(&result, &val, sizeof(val));
751 return result - 1.0f;
754 // Helper function to convert two 32-bit integers to a double between [0..1).
755 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32_t x0, uint32_t x1)
757 // IEEE754 doubles are formatted as follows (MSB first):
758 // sign(1) exponent(11) mantissa(52)
759 // Conceptually construct the following:
761 // exponent == 1023 -- an excess 1023 representation of a zero exponent
762 // mantissa == 52 random bits
763 const uint32_t mhi = x0 & 0xfffffu; // upper 20 bits of mantissa
764 const uint32_t mlo = x1; // lower 32 bits of mantissa
765 const uint64_t man = (static_cast<uint64_t>(mhi) << 32) | mlo; // mantissa
766 const uint64_t exp = static_cast<uint64_t>(1023);
767 const uint64_t val = (exp << 52) | man;
768 // Assumes that endian-ness is same for double and uint64.
770 memcpy(&result, &val, sizeof(val));
774 } // namespace random
775 } // namespace tensorflow
778 #endif // __NNFW_CKER_HELPER_RANDOM_DISTRIBUTIONS_H__