/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef __NNFW_CKER_UTILS_H__
#define __NNFW_CKER_UTILS_H__

#include "Shape.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>

#include <fixedpoint/fixedpoint.h>

namespace nnfw
{
namespace cker
{

template <typename T>
inline T ActivationFunctionWithMinMax(T x, T output_activation_min, T output_activation_max)
{
  return std::min<T>(std::max<T>(x, output_activation_min), output_activation_max);
}

inline void QuantizeMultiplier(double double_multiplier, int32_t *quantized_multiplier, int *shift)
{
  if (double_multiplier == 0.)
  {
    *quantized_multiplier = 0;
    *shift = 0;
    return;
  }

  const double q = std::frexp(double_multiplier, shift);
  auto q_fixed = static_cast<int64_t>(std::round(q * (1ll << 31)));

  assert(q_fixed <= (1ll << 31));
  if (q_fixed == (1ll << 31))
  {
    q_fixed /= 2;
    ++*shift;
  }
  assert(q_fixed <= std::numeric_limits<int32_t>::max());
  // A shift amount smaller than -31 would cause all bits to be shifted out
  // and thus all results would be zero. We implement that instead with
  // q_fixed==0, so as to avoid hitting issues with right-shift
  // operations with shift amounts greater than 31. Note that this happens
  // roughly when abs(double_multiplier) < 2^-31 and the present handling means
  // that we're effectively flushing tiny double_multiplier's to zero.
  // We could conceivably handle values in the range (roughly) [32, 63]
  // as 'denormals' i.e. (shift==0, q_fixed < 2^30). From that point of view
  // the present handling is just doing 'flush denormals to zero'. We could
  // reconsider and actually generate nonzero denormals if a need arises.
  if (*shift < -31)
  {
    *shift = 0;
    q_fixed = 0;
  }
  *quantized_multiplier = static_cast<int32_t>(q_fixed);
}

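// Illustrative example (not part of the upstream header): QuantizeMultiplier
// encodes a real multiplier as a Q31 fixed-point value plus a power-of-two
// exponent, so that double_multiplier ~= quantized_multiplier * 2^(shift - 31).
//
//   int32_t qm;
//   int shift;
//   QuantizeMultiplier(2.5, &qm, &shift);
//   // std::frexp(2.5) = 0.625 * 2^2, so shift == 2 and
//   // qm == round(0.625 * 2^31) == 1342177280.
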
inline void QuantizeMultiplierSmallerThanOneExp(double double_multiplier,
                                                int32_t *quantized_multiplier, int *left_shift)
{
  assert(double_multiplier < 1.0);
  assert(double_multiplier > 0.0);
  int shift;
  QuantizeMultiplier(double_multiplier, quantized_multiplier, &shift);
  assert(shift <= 0);
  *left_shift = shift;
}

inline int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier, int shift)
{
  int left_shift = shift > 0 ? shift : 0;
  int right_shift = shift > 0 ? 0 : -shift;
  return gemmlowp::RoundingDivideByPOT(
      gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier),
      right_shift);
}

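// Worked example (illustrative): QuantizeMultiplier(0.5, &qm, &shift) yields
// qm == 1 << 30 and shift == 0, and then
//
//   MultiplyByQuantizedMultiplier(100, qm, 0) == 50
//
// since SaturatingRoundingDoublingHighMul(100, 1 << 30) computes
// 100 * 2^30 / 2^31 == 50 (with rounding) and RoundingDivideByPOT(50, 0) is a
// no-op.
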
inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(int32_t x, int32_t quantized_multiplier,
                                                           int left_shift)
{
  return gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier);
}

inline int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp(int32_t x,
                                                              int32_t quantized_multiplier,
                                                              int left_shift)
{
  return gemmlowp::RoundingDivideByPOT(
      gemmlowp::SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift);
}

inline int NodeOffset(int b, int h, int w, int height, int width)
{
  return (b * height + h) * width + w;
}

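// Example (illustrative): with height == 4 and width == 5,
// NodeOffset(1, 2, 3, 4, 5) == (1 * 4 + 2) * 5 + 3 == 33.
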
inline int CountLeadingZeros(uint32_t integer_input)
{
  // Guard against an infinite loop: shifting 0 left never sets the top bit.
  if (integer_input == 0)
  {
    return std::numeric_limits<uint32_t>::digits;
  }
  const uint32_t one_in_leading_positive = 1U << 31;
  int leading_zeros = 0;
  while (integer_input < one_in_leading_positive)
  {
    integer_input <<= 1;
    ++leading_zeros;
  }
  return leading_zeros;
}

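// Example values (illustrative): CountLeadingZeros(1) == 31 and
// CountLeadingZeros(0x80000000u) == 0; with the zero guard above,
// CountLeadingZeros(0) == 32.
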
inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift,
                                             int32_t *output_inv_sqrt, int *output_shift)
{
  assert(input >= 0);
  if (input <= 1)
  {
    // Handle the input value 1 separately to avoid overflow in that case
    // in the general computation below (b/143972021). Also handle 0 as if it
    // were a 1. 0 is an invalid input here (divide by zero) and 1 is a valid
    // but rare/unrealistic input value. We can expect both to occur in some
    // incompletely trained models, but probably not in fully trained models.
    *output_inv_sqrt = std::numeric_limits<std::int32_t>::max();
    *output_shift = 0;
    return;
  }
  assert(input > 1);
  *output_shift = 11;
  while (input >= (1 << 29))
  {
    input /= 4;
    ++*output_shift;
  }
  const unsigned max_left_shift_bits = CountLeadingZeros(static_cast<uint32_t>(input)) - 1;
  const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
  const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
  *output_shift -= left_shift_bit_pairs;
  input <<= 2 * left_shift_bit_pairs;
  assert(input >= (1 << 27));
  assert(input < (1 << 29));
  using gemmlowp::FixedPoint;
  using gemmlowp::Rescale;
  using gemmlowp::SaturatingRoundingMultiplyByPOT;
  // Using 3 integer bits gives us enough room for the internal arithmetic in
  // this Newton-Raphson iteration.
  using F3 = FixedPoint<int32_t, 3>;
  using F0 = FixedPoint<int32_t, 0>;
  const F3 fixedpoint_input = F3::FromRaw(input >> 1);
  const F3 fixedpoint_half_input = SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
  const F3 fixedpoint_half_three =
      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
  // Newton-Raphson iteration for x = 1 / sqrt(input), using the update
  //   x_{n+1} = x_n * (3 - input * x_n^2) / 2
  // (here `input` has been rescaled into [1/4, 1) above).
  // Naive unoptimized starting guess: x = 1
  F3 x = F3::One();
  // Naive unoptimized number of iterations: 5
  for (int i = 0; i < 5; i++)
  {
    const F3 x3 = Rescale<3>(x * x * x);
    x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3);
  }
  const F0 fixedpoint_half_sqrt_2 =
      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
  x = x * fixedpoint_half_sqrt_2;
  *output_inv_sqrt = x.raw();
  if (*output_shift < 0)
  {
    *output_inv_sqrt <<= -*output_shift;
    *output_shift = 0;
  }
  // Convert right shift (right is positive) to left shift.
  *output_shift *= reverse_shift;
}

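// Usage sketch (illustrative; variable names are hypothetical). This mirrors
// how L2-normalization kernels consume the multiplier/exponent pair:
//
//   int32_t inv_sqrt_multiplier;
//   int inv_sqrt_shift;
//   GetInvSqrtQuantizedMultiplierExp(sum_of_squares, /*reverse_shift=*/-1,
//                                    &inv_sqrt_multiplier, &inv_sqrt_shift);
//   // value / sqrt(sum_of_squares) is then approximated by
//   // MultiplyByQuantizedMultiplier(value, inv_sqrt_multiplier,
//   //                               inv_sqrt_shift);
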
// Comment from tensorflow lite:
//
// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
// BROADCASTING.
//
// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional
// rectangular array of numbers.
//
// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h.
// However, as Dims<N> is to be deprecated, this class exists as an adaptor
// to enable simple unoptimized implementations of element-wise broadcasting
// operations.
template <int N> struct NdArrayDesc
{
  // The "extent" of each dimension. Indices along dimension d must be in the
  // half-open interval [0, extents[d]).
  int extents[N];

  // The number of *elements* (not bytes) between consecutive indices of each
  // dimension.
  int strides[N];
};

// Comment from tensorflow lite:
//
// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
// BROADCASTING.
//
// Same as Offset(), except takes an NdArrayDesc<N> instead of Dims<N>.
inline int SubscriptToIndex(const NdArrayDesc<4> &desc, int i0, int i1, int i2, int i3)
{
  assert(i0 >= 0 && i0 < desc.extents[0]);
  assert(i1 >= 0 && i1 < desc.extents[1]);
  assert(i2 >= 0 && i2 < desc.extents[2]);
  assert(i3 >= 0 && i3 < desc.extents[3]);
  return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] + i3 * desc.strides[3];
}

template <int N> inline int SubscriptToIndexGeneric(const NdArrayDesc<N> *desc, int *iter)
{
  int ret_indx = 0;
  for (size_t idx = 0; idx < static_cast<size_t>(N); idx++)
  {
    assert(iter[idx] >= 0 && iter[idx] < desc->extents[idx]);
    ret_indx += iter[idx] * desc->strides[idx];
  }

  return ret_indx;
}

// Copies dims to desc, calculating strides.
template <int N> inline void CopyDimsToDesc(const Shape &input_shape, NdArrayDesc<N> *desc_out)
{
  int desc_stride = 1;
  for (int i = N - 1; i >= 0; --i)
  {
    desc_out->extents[i] = input_shape.Dims(i);
    desc_out->strides[i] = desc_stride;
    desc_stride *= input_shape.Dims(i);
  }
}

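// Example (illustrative): for a contiguous shape {2, 3, 4, 5} this produces
// extents {2, 3, 4, 5} and strides {60, 20, 5, 1}, so
// SubscriptToIndex(desc, 1, 2, 3, 4) == 1*60 + 2*20 + 3*5 + 4 == 119.
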
template <int N>
inline void
NdArrayDescsForElementwiseBroadcast(const Shape &input0_shape, const Shape &input1_shape,
                                    NdArrayDesc<N> *desc0_out, NdArrayDesc<N> *desc1_out)
{
  assert(desc0_out != nullptr);
  assert(desc1_out != nullptr);

  auto extended_input0_shape = Shape::ExtendedShape(N, input0_shape);
  auto extended_input1_shape = Shape::ExtendedShape(N, input1_shape);

  // Copy dims to desc, calculating strides.
  CopyDimsToDesc<N>(extended_input0_shape, desc0_out);
  CopyDimsToDesc<N>(extended_input1_shape, desc1_out);

  // Walk over each dimension. If the extents are equal do nothing.
  // Otherwise, set the desc with extent 1 to have extent equal to the other
  // and stride 0.
  for (int i = 0; i < N; ++i)
  {
    const int extent0 = extended_input0_shape.Dims(i);
    const int extent1 = extended_input1_shape.Dims(i);
    if (extent0 != extent1)
    {
      if (extent0 == 1)
      {
        desc0_out->strides[i] = 0;
        desc0_out->extents[i] = extent1;
      }
      else
      {
        assert(extent1 == 1);
        desc1_out->strides[i] = 0;
        desc1_out->extents[i] = extent0;
      }
    }
  }
}

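// Example (illustrative): broadcasting shapes {2, 1} and {1, 3} with N == 2.
// After the call, both descs describe the common {2, 3} iteration space, and
// each dimension that had extent 1 gets stride 0 so the same element is
// revisited instead of advancing:
//
//   NdArrayDesc<2> d0, d1;  // hypothetical 2-D instantiation
//   NdArrayDescsForElementwiseBroadcast<2>(shape0, shape1, &d0, &d1);
//   // d0: extents {2, 3}, strides {1, 0}
//   // d1: extents {2, 3}, strides {0, 1}
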
template <int N>
inline void
NdArrayDescsForElementwiseBroadcast(const Shape &input0_shape, const Shape &input1_shape,
                                    const Shape &input2_shape, NdArrayDesc<N> *desc0_out,
                                    NdArrayDesc<N> *desc1_out, NdArrayDesc<N> *desc2_out)
{
  assert(desc0_out != nullptr);
  assert(desc1_out != nullptr);
  assert(desc2_out != nullptr);

  auto extended_input0_shape = Shape::ExtendedShape(N, input0_shape);
  auto extended_input1_shape = Shape::ExtendedShape(N, input1_shape);
  auto extended_input2_shape = Shape::ExtendedShape(N, input2_shape);

  // Copy dims to desc, calculating strides.
  CopyDimsToDesc<N>(extended_input0_shape, desc0_out);
  CopyDimsToDesc<N>(extended_input1_shape, desc1_out);
  CopyDimsToDesc<N>(extended_input2_shape, desc2_out);

  // Walk over each dimension. If the extents are equal do nothing.
  // Otherwise, set the desc with extent 1 to have extent equal to the other
  // and stride 0.
  for (int i = 0; i < N; ++i)
  {
    const int extent0 = extended_input0_shape.Dims(i);
    const int extent1 = extended_input1_shape.Dims(i);
    const int extent2 = extended_input2_shape.Dims(i);

    // The broadcast extent of this dimension is the (unique) non-1 extent.
    int extent = extent0;
    if (extent == 1)
      extent = extent1;
    if (extent == 1)
      extent = extent2;

    assert(extent0 == 1 || extent0 == extent);
    assert(extent1 == 1 || extent1 == extent);
    assert(extent2 == 1 || extent2 == extent);

    if (!(extent0 == extent1 && extent1 == extent2))
    {
      if (extent0 == 1)
      {
        desc0_out->strides[i] = 0;
        desc0_out->extents[i] = extent;
      }
      if (extent1 == 1)
      {
        desc1_out->strides[i] = 0;
        desc1_out->extents[i] = extent;
      }
      if (extent2 == 1)
      {
        desc2_out->strides[i] = 0;
        desc2_out->extents[i] = extent;
      }
    }
  }
}

// Gets next index to iterate through a multidimensional array.
inline bool NextIndex(const int num_dims, const int *dims, int *current)
{
  if (num_dims == 0)
  {
    return false;
  }
  assert(dims != nullptr);
  assert(current != nullptr);
  int carry = 1;
  for (int idx = num_dims - 1; idx >= 0; --idx)
  {
    int current_val = current[idx] + carry;
    assert(dims[idx] >= current_val);
    if (dims[idx] == current_val)
    {
      current[idx] = 0;
    }
    else
    {
      current[idx] = current_val;
      carry = 0;
      break;
    }
  }
  return (carry == 0);
}

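// Usage sketch (illustrative): NextIndex advances `current` like an odometer
// and returns false once every index has been produced, so a whole array can
// be visited with a do/while loop:
//
//   const int dims[2] = {2, 2};
//   int index[2] = {0, 0};
//   do
//   {
//     // visits (0,0), (0,1), (1,0), (1,1)
//   } while (NextIndex(2, dims, index));
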
// Gets the offset of an index when reducing on an axis. When reducing, the
// flattened offset will not change if the input index changes on the given
// axis. For example, if you have a 3D tensor and you are reducing to 2D by
// eliminating axis 0, then index (0, 1, 2) and index (1, 1, 2) map to the
// same flattened offset.
// TODO(kanlig): use Dims to represent dimensions.
inline size_t ReducedOutputOffset(const int num_dims, const int *dims, const int *index,
                                  const int num_axis, const int *axis)
{
  if (num_dims == 0)
  {
    return 0;
  }

  assert(dims != nullptr);
  assert(index != nullptr);

  size_t offset = 0;
  for (int idx = 0; idx < num_dims; ++idx)
  {
    // Skip dimensions that are being reduced over.
    bool is_axis = false;
    if (axis != nullptr)
    {
      for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx)
      {
        if (idx == axis[axis_idx])
        {
          is_axis = true;
          break;
        }
      }
    }
    if (!is_axis)
    {
      offset = offset * static_cast<size_t>(dims[idx]) + static_cast<size_t>(index[idx]);
    }
  }
  return offset;
}

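// Example (illustrative): for dims {2, 3, 4} with axis == {0} (num_axis == 1),
// the first coordinate is skipped, so index (1, 1, 2) and index (0, 1, 2)
// both map to offset 1 * 4 + 2 == 6.
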
template <typename T> void optimized_ops_preload_l1_keep(const T *ptr)
{
#ifdef __GNUC__
  // builtin offered by GCC-compatible compilers including clang
  __builtin_prefetch(ptr, /* 0 means read */ 0, /* 3 means high locality */ 3);
#else
  (void)ptr;
#endif
}

// Writes randomly accessed values from `input` sequentially into `output`.
template <typename T> class SequentialTensorWriter
{
public:
  SequentialTensorWriter(const T *input_data, T *output_data)
      : input_data_(input_data), output_ptr_(output_data)
  {
  }

  void Write(int position) { *output_ptr_++ = input_data_[position]; }
  void WriteN(int position, int len)
  {
    memcpy(output_ptr_, &input_data_[position], sizeof(T) * len);
    output_ptr_ += len;
  }

private:
  const T *input_data_;
  T *output_ptr_;
};

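// Usage sketch (illustrative; `input_data`/`output_data` are hypothetical
// buffers), e.g. for a gather-style kernel:
//
//   SequentialTensorWriter<float> writer(input_data, output_data);
//   writer.Write(5);       // output[0] = input[5]
//   writer.WriteN(10, 3);  // output[1..3] = input[10..12]
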
} // namespace cker
} // namespace nnfw

#endif // __NNFW_CKER_UTILS_H__