compute/cker/include/cker/operation/Helper/PhiloxRandom.h

   1 /*
   2  * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
   3  * Copyright 2015 The TensorFlow Authors. All Rights Reserved.
   4  *
   5  * Licensed under the Apache License, Version 2.0 (the "License");
   6  * you may not use this file except in compliance with the License.
   7  * You may obtain a copy of the License at
   8  *
   9  *      http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 #ifndef TENSORFLOW_CORE_LIB_RANDOM_PHILOX_RANDOM_H_
  19 #define TENSORFLOW_CORE_LIB_RANDOM_PHILOX_RANDOM_H_
  20
  21 #include <stdlib.h>
  22
  23 #include "cker/Types.h"
  24 #include "cker/Shape.h"
  25 #include "cker/Utils.h"
  26
  27 // Function qualifiers that need to work on both CPU and GPU.
  28 #if defined(__CUDACC__) || defined(__HIPCC__)
  29 // For nvcc.
  30 #define PHILOX_DEVICE_FUNC __host__ __device__
  31 #define PHILOX_INLINE __inline__
  32 #else
  33 // For non-nvcc.
  34 #define PHILOX_DEVICE_FUNC
  35 #define PHILOX_INLINE inline
  36 #endif
  37 #define PHILOX_DEVICE_INLINE PHILOX_DEVICE_FUNC PHILOX_INLINE
  38
  39 #include <math.h>
  40
  41 namespace nnfw
  42 {
  43 namespace cker
  44 {
  45 namespace random
  46 {
  47
  48 // A class that represents an inline array. It can be used on both CPU and GPU,
  49 // and also trivially copyable between CPU and GPU.
  50 // Arguments:
  51 //   T: the array element type;
  52 //   ElementCount: the fixed size of the array;
  53 template <typename T, int ElementCount> class Array
  54 {
  55 public:
  56   static constexpr int kElementCount = ElementCount;
  57   PHILOX_DEVICE_INLINE Array()
  58   {
  59     for (int i = 0; i < ElementCount; ++i)
  60     {
  61       data_[i] = T(0);
  62     }
  63   }
  64
  65   PHILOX_DEVICE_INLINE const T &operator[](int index) const { return data_[index]; }
  66
  67   PHILOX_DEVICE_INLINE T &operator[](int index) { return data_[index]; }
  68
  69   size_t size() const { return ElementCount; }
  70
  71 private:
  72   T data_[ElementCount];
  73 };
  74
  75 // A class that encapsulates all the states for a random number generator using
  76 // the philox_4x32_10 algorithm. Each invocation returns a 128-bit random bits
  77 // in the form of four uint32.
  78 // There are multiple variants of this algorithm, we picked the 4x32_10 version
  79 // that is most suited for our applications.
  80 // Since this class is meant to be copied between CPU to GPU, it maintains a
  81 // value semantics.
  82 //
  83 // For example: To use this class and populate an array of 1024 randoms on CPU
  84 // with two threads,
  85 //
  86 //  void Fill(PhiloxRandom rnd, uint32* output, int start, int limit) {
  87 //    assert(start % 4 == 0);
  88 //    assert(limit % 4 == 0);
  89 //    rnd.Skip(start / 4);
  90 //    for (int i = start; i < limit; i += 4) {
  91 //      auto sample = rnd();
  92 //      ... copy sample[0..3] to output[i..i+3]
  93 //    }
  94 //  }
  95 //
  96 //  PhiloxRandom rng(seed);
  97 //  PhiloxRandom rng_copy = rng;
  98 //  rng.Skip(1000/4);
  99 //
 100 //  ... schedule Fill(rng_copy, output, 0, 512) in thread 1;
 101 //  ... schedule Fill(rng_copy, output, 512, 1024) in thread 2;
 102 //  ... wait for thread 1 & 2 to finish executing Fill().
 103 //
 104 // NOTE:
 105 // 1. PhiloxRandom is trivially copyable.
 106 // 2. PhiloxRandom is compilable by gcc and nvcc.
 107 class PhiloxRandom
 108 {
 109 public:
 110   using ResultType = Array<uint32_t, 4>;
 111   using ResultElementType = uint32_t;
 112   // The number of elements that will be returned.
 113   static constexpr int kResultElementCount = 4;
 114   // Cost of generation of a single element (in cycles).
 115   static constexpr int kElementCost = 10;
 116   // The type for the 64-bit key stored in the form of two 32-bit uint
 117   // that are used in the diffusion process.
 118   using Key = Array<uint32_t, 2>;
 119
 120   PHILOX_DEVICE_INLINE
 121   PhiloxRandom() {}
 122
 123   PHILOX_DEVICE_INLINE
 124   explicit PhiloxRandom(uint64_t seed)
 125   {
 126     key_[0] = static_cast<uint32_t>(seed);
 127     key_[1] = static_cast<uint32_t>(seed >> 32);
 128   }
 129
 130   PHILOX_DEVICE_INLINE
 131   explicit PhiloxRandom(uint64_t seed_lo, uint64_t seed_hi)
 132   {
 133     key_[0] = static_cast<uint32_t>(seed_lo);
 134     key_[1] = static_cast<uint32_t>(seed_lo >> 32);
 135     counter_[2] = static_cast<uint32_t>(seed_hi);
 136     counter_[3] = static_cast<uint32_t>(seed_hi >> 32);
 137   }
 138
 139   PHILOX_DEVICE_INLINE
 140   PhiloxRandom(ResultType counter, Key key) : counter_(counter), key_(key) {}
 141
 142   PHILOX_DEVICE_INLINE
 143   ResultType const &counter() const { return counter_; }
 144
 145   PHILOX_DEVICE_INLINE
 146   Key const &key() const { return key_; }
 147
 148   // Skip the specified number of samples of 128-bits in the current stream.
 149   PHILOX_DEVICE_INLINE
 150   void Skip(uint64_t count)
 151   {
 152     const uint32_t count_lo = static_cast<uint32_t>(count);
 153     uint32_t count_hi = static_cast<uint32_t>(count >> 32);
 154
 155     counter_[0] += count_lo;
 156     if (counter_[0] < count_lo)
 157     {
 158       ++count_hi;
 159     }
 160
 161     counter_[1] += count_hi;
 162     if (counter_[1] < count_hi)
 163     {
 164       if (++counter_[2] == 0)
 165       {
 166         ++counter_[3];
 167       }
 168     }
 169   }
 170
 171   // Returns a group of four random numbers using the underlying Philox
 172   // algorithm.
 173   PHILOX_DEVICE_INLINE ResultType operator()()
 174   {
 175     ResultType counter = counter_;
 176     Key key = key_;
 177
 178     // Run the single rounds for ten times. Manually unrolling the loop
 179     // for better performance.
 180     counter = ComputeSingleRound(counter, key);
 181     RaiseKey(&key);
 182     counter = ComputeSingleRound(counter, key);
 183     RaiseKey(&key);
 184     counter = ComputeSingleRound(counter, key);
 185     RaiseKey(&key);
 186     counter = ComputeSingleRound(counter, key);
 187     RaiseKey(&key);
 188     counter = ComputeSingleRound(counter, key);
 189     RaiseKey(&key);
 190     counter = ComputeSingleRound(counter, key);
 191     RaiseKey(&key);
 192     counter = ComputeSingleRound(counter, key);
 193     RaiseKey(&key);
 194     counter = ComputeSingleRound(counter, key);
 195     RaiseKey(&key);
 196     counter = ComputeSingleRound(counter, key);
 197     RaiseKey(&key);
 198     counter = ComputeSingleRound(counter, key);
 199
 200     SkipOne();
 201
 202     return counter;
 203   }
 204
 205 private:
 206   // We use the same constants as recommended by the original paper.
 207   static constexpr uint32_t kPhiloxW32A = 0x9E3779B9;
 208   static constexpr uint32_t kPhiloxW32B = 0xBB67AE85;
 209   static constexpr uint32_t kPhiloxM4x32A = 0xD2511F53;
 210   static constexpr uint32_t kPhiloxM4x32B = 0xCD9E8D57;
 211
 212   // Helper function to skip the next sample of 128-bits in the current stream.
 213   PHILOX_DEVICE_INLINE void SkipOne()
 214   {
 215     if (++counter_[0] == 0)
 216     {
 217       if (++counter_[1] == 0)
 218       {
 219         if (++counter_[2] == 0)
 220         {
 221           ++counter_[3];
 222         }
 223       }
 224     }
 225   }
 226
 227   // Helper function to return the lower and higher 32-bits from two 32-bit
 228   // integer multiplications.
 229   PHILOX_DEVICE_INLINE
 230   static void MultiplyHighLow(uint32_t a, uint32_t b, uint32_t *result_low, uint32_t *result_high)
 231   {
 232 #ifndef __CUDA_ARCH__
 233     const uint64_t product = static_cast<uint64_t>(a) * b;
 234     *result_low = static_cast<uint32_t>(product);
 235     *result_high = static_cast<uint32_t>(product >> 32);
 236 #else
 237     *result_low = a * b;
 238     *result_high = __umulhi(a, b);
 239 #endif
 240   }
 241
 242   // Helper function for a single round of the underlying Philox algorithm.
 243   PHILOX_DEVICE_INLINE static ResultType ComputeSingleRound(const ResultType &counter,
 244                                                             const Key &key)
 245   {
 246     uint32_t lo0;
 247     uint32_t hi0;
 248     MultiplyHighLow(kPhiloxM4x32A, counter[0], &lo0, &hi0);
 249
 250     uint32_t lo1;
 251     uint32_t hi1;
 252     MultiplyHighLow(kPhiloxM4x32B, counter[2], &lo1, &hi1);
 253
 254     ResultType result;
 255     result[0] = hi1 ^ counter[1] ^ key[0];
 256     result[1] = lo1;
 257     result[2] = hi0 ^ counter[3] ^ key[1];
 258     result[3] = lo0;
 259     return result;
 260   }
 261
 262   PHILOX_DEVICE_INLINE void RaiseKey(Key *key)
 263   {
 264     (*key)[0] += kPhiloxW32A;
 265     (*key)[1] += kPhiloxW32B;
 266   }
 267
 268 private:
 269   ResultType counter_;
 270   Key key_;
 271 };
 272
 273 } // namespace random
 274 } // namespace cker
 275 } // namespace nnfw
 276 #endif // TENSORFLOW_CORE_LIB_RANDOM_PHILOX_RANDOM_H_