2 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
3 * Copyright 2015 The TensorFlow Authors. All Rights Reserved.
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
18 #ifndef TENSORFLOW_CORE_LIB_RANDOM_PHILOX_RANDOM_H_
19 #define TENSORFLOW_CORE_LIB_RANDOM_PHILOX_RANDOM_H_
23 #include "cker/Types.h"
24 #include "cker/Shape.h"
25 #include "cker/Utils.h"
27 // Function qualifiers that need to work on both CPU and GPU.
28 #if defined(__CUDACC__) || defined(__HIPCC__)
30 #define PHILOX_DEVICE_FUNC __host__ __device__
31 #define PHILOX_INLINE __inline__
34 #define PHILOX_DEVICE_FUNC
35 #define PHILOX_INLINE inline
37 #define PHILOX_DEVICE_INLINE PHILOX_DEVICE_FUNC PHILOX_INLINE
48 // A class that represents an inline array. It can be used on both CPU and GPU,
49 // and also trivially copyable between CPU and GPU.
51 // T: the array element type;
52 // ElementCount: the fixed size of the array;
53 template <typename T, int ElementCount> class Array
56 static constexpr int kElementCount = ElementCount;
57 PHILOX_DEVICE_INLINE Array()
59 for (int i = 0; i < ElementCount; ++i)
65 PHILOX_DEVICE_INLINE const T &operator[](int index) const { return data_[index]; }
67 PHILOX_DEVICE_INLINE T &operator[](int index) { return data_[index]; }
69 size_t size() const { return ElementCount; }
72 T data_[ElementCount];
75 // A class that encapsulates all the states for a random number generator using
76 // the philox_4x32_10 algorithm. Each invocation returns a 128-bit random bits
77 // in the form of four uint32.
78 // There are multiple variants of this algorithm, we picked the 4x32_10 version
79 // that is most suited for our applications.
80 // Since this class is meant to be copied between CPU to GPU, it maintains a
83 // For example: To use this class and populate an array of 1024 randoms on CPU
86 // void Fill(PhiloxRandom rnd, uint32* output, int start, int limit) {
87 // assert(start % 4 == 0);
88 // assert(limit % 4 == 0);
89 // rnd.Skip(start / 4);
90 // for (int i = start; i < limit; i += 4) {
91 // auto sample = rnd();
92 // ... copy sample[0..3] to output[i..i+3]
96 // PhiloxRandom rng(seed);
97 // PhiloxRandom rng_copy = rng;
100 // ... schedule Fill(rng_copy, output, 0, 512) in thread 1;
101 // ... schedule Fill(rng_copy, output, 512, 1024) in thread 2;
102 // ... wait for thread 1 & 2 to finish executing Fill().
105 // 1. PhiloxRandom is trivially copyable.
106 // 2. PhiloxRandom is compilable by gcc and nvcc.
110 using ResultType = Array<uint32_t, 4>;
111 using ResultElementType = uint32_t;
112 // The number of elements that will be returned.
113 static constexpr int kResultElementCount = 4;
114 // Cost of generation of a single element (in cycles).
115 static constexpr int kElementCost = 10;
116 // The type for the 64-bit key stored in the form of two 32-bit uint
117 // that are used in the diffusion process.
118 using Key = Array<uint32_t, 2>;
124 explicit PhiloxRandom(uint64_t seed)
126 key_[0] = static_cast<uint32_t>(seed);
127 key_[1] = static_cast<uint32_t>(seed >> 32);
131 explicit PhiloxRandom(uint64_t seed_lo, uint64_t seed_hi)
133 key_[0] = static_cast<uint32_t>(seed_lo);
134 key_[1] = static_cast<uint32_t>(seed_lo >> 32);
135 counter_[2] = static_cast<uint32_t>(seed_hi);
136 counter_[3] = static_cast<uint32_t>(seed_hi >> 32);
140 PhiloxRandom(ResultType counter, Key key) : counter_(counter), key_(key) {}
143 ResultType const &counter() const { return counter_; }
146 Key const &key() const { return key_; }
148 // Skip the specified number of samples of 128-bits in the current stream.
150 void Skip(uint64_t count)
152 const uint32_t count_lo = static_cast<uint32_t>(count);
153 uint32_t count_hi = static_cast<uint32_t>(count >> 32);
155 counter_[0] += count_lo;
156 if (counter_[0] < count_lo)
161 counter_[1] += count_hi;
162 if (counter_[1] < count_hi)
164 if (++counter_[2] == 0)
171 // Returns a group of four random numbers using the underlying Philox
173 PHILOX_DEVICE_INLINE ResultType operator()()
175 ResultType counter = counter_;
178 // Run the single rounds for ten times. Manually unrolling the loop
179 // for better performance.
180 counter = ComputeSingleRound(counter, key);
182 counter = ComputeSingleRound(counter, key);
184 counter = ComputeSingleRound(counter, key);
186 counter = ComputeSingleRound(counter, key);
188 counter = ComputeSingleRound(counter, key);
190 counter = ComputeSingleRound(counter, key);
192 counter = ComputeSingleRound(counter, key);
194 counter = ComputeSingleRound(counter, key);
196 counter = ComputeSingleRound(counter, key);
198 counter = ComputeSingleRound(counter, key);
206 // We use the same constants as recommended by the original paper.
207 static constexpr uint32_t kPhiloxW32A = 0x9E3779B9;
208 static constexpr uint32_t kPhiloxW32B = 0xBB67AE85;
209 static constexpr uint32_t kPhiloxM4x32A = 0xD2511F53;
210 static constexpr uint32_t kPhiloxM4x32B = 0xCD9E8D57;
212 // Helper function to skip the next sample of 128-bits in the current stream.
213 PHILOX_DEVICE_INLINE void SkipOne()
215 if (++counter_[0] == 0)
217 if (++counter_[1] == 0)
219 if (++counter_[2] == 0)
227 // Helper function to return the lower and higher 32-bits from two 32-bit
228 // integer multiplications.
230 static void MultiplyHighLow(uint32_t a, uint32_t b, uint32_t *result_low, uint32_t *result_high)
232 #ifndef __CUDA_ARCH__
233 const uint64_t product = static_cast<uint64_t>(a) * b;
234 *result_low = static_cast<uint32_t>(product);
235 *result_high = static_cast<uint32_t>(product >> 32);
238 *result_high = __umulhi(a, b);
242 // Helper function for a single round of the underlying Philox algorithm.
243 PHILOX_DEVICE_INLINE static ResultType ComputeSingleRound(const ResultType &counter,
248 MultiplyHighLow(kPhiloxM4x32A, counter[0], &lo0, &hi0);
252 MultiplyHighLow(kPhiloxM4x32B, counter[2], &lo1, &hi1);
255 result[0] = hi1 ^ counter[1] ^ key[0];
257 result[2] = hi0 ^ counter[3] ^ key[1];
262 PHILOX_DEVICE_INLINE void RaiseKey(Key *key)
264 (*key)[0] += kPhiloxW32A;
265 (*key)[1] += kPhiloxW32B;
273 } // namespace random
276 #endif // TENSORFLOW_CORE_LIB_RANDOM_PHILOX_RANDOM_H_