/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef __NNFW_CKER_UTILS_H__
#define __NNFW_CKER_UTILS_H__

#include "Shape.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>

#include <fixedpoint/fixedpoint.h>

namespace nnfw
{
namespace cker
{

template <typename T>
inline T ActivationFunctionWithMinMax(T x, T output_activation_min, T output_activation_max)
{
  return std::min<T>(std::max<T>(x, output_activation_min), output_activation_max);
}

inline void QuantizeMultiplier(double double_multiplier, int32_t *quantized_multiplier, int *shift)
{
  if (double_multiplier == 0.)
  {
    *quantized_multiplier = 0;
    *shift = 0;
    return;
  }

  const double q = std::frexp(double_multiplier, shift);
  auto q_fixed = static_cast<int64_t>(std::round(q * (1ll << 31)));

  assert(q_fixed <= (1ll << 31));
  if (q_fixed == (1ll << 31))
  {
    q_fixed /= 2;
    ++*shift;
  }
  assert(q_fixed <= std::numeric_limits<int32_t>::max());
  // A shift amount smaller than -31 would cause all bits to be shifted out
  // and thus all results would be zero. We implement that instead with
  // q_fixed==0, so as to avoid hitting issues with right-shift
  // operations with shift amounts greater than 31. Note that this happens
  // roughly when abs(double_multiplier) < 2^-31 and the present handling means
  // that we're effectively flushing tiny double_multiplier's to zero.
  // We could conceivably handle values in the range (roughly) [32, 63]
  // as 'denormals' i.e. (shift==0, q_fixed < 2^30). From that point of view
  // the present handling is just doing 'flush denormals to zero'. We could
  // reconsider and actually generate nonzero denormals if a need arises.
  if (*shift < -31)
  {
    *shift = 0;
    q_fixed = 0;
  }
  *quantized_multiplier = static_cast<int32_t>(q_fixed);
}

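// Illustrative example (not part of the upstream header): QuantizeMultiplier
// encodes a real multiplier as a Q31 fixed-point value plus a power-of-two
// exponent, so that double_multiplier ~= quantized_multiplier * 2^(shift - 31).
//
//   int32_t qm;
//   int shift;
//   QuantizeMultiplier(2.5, &qm, &shift);
//   // std::frexp(2.5) = 0.625 * 2^2, so shift == 2 and
//   // qm == round(0.625 * 2^31) == 1342177280.
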
inline void QuantizeMultiplierSmallerThanOneExp(double double_multiplier,
                                                int32_t *quantized_multiplier, int *left_shift)
{
  assert(double_multiplier < 1.0);
  assert(double_multiplier > 0.0);
  int shift;
  QuantizeMultiplier(double_multiplier, quantized_multiplier, &shift);
  assert(shift <= 0);
  *left_shift = shift;
}

inline int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier, int shift)
{
  int left_shift = shift > 0 ? shift : 0;
  int right_shift = shift > 0 ? 0 : -shift;
  return gemmlowp::RoundingDivideByPOT(
      gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier),
      right_shift);
}

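// Worked example (illustrative): QuantizeMultiplier(0.5, &qm, &shift) yields
// qm == 1 << 30 and shift == 0, and then
//
//   MultiplyByQuantizedMultiplier(100, qm, 0) == 50
//
// since SaturatingRoundingDoublingHighMul(100, 1 << 30) computes
// 100 * 2^30 / 2^31 == 50 (with rounding) and RoundingDivideByPOT(50, 0) is a
// no-op.
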
inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(int32_t x, int32_t quantized_multiplier,
                                                           int left_shift)
{
  return gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier);
}

inline int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp(int32_t x,
                                                              int32_t quantized_multiplier,
                                                              int left_shift)
{
  return gemmlowp::RoundingDivideByPOT(
      gemmlowp::SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift);
}

inline int NodeOffset(int b, int h, int w, int height, int width)
{
  return (b * height + h) * width + w;
}

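// Example (illustrative): with height == 4 and width == 5,
// NodeOffset(1, 2, 3, 4, 5) == (1 * 4 + 2) * 5 + 3 == 33.
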
inline int CountLeadingZeros(uint32_t integer_input)
{
  // Guard against an infinite loop: shifting 0 left never sets the top bit.
  if (integer_input == 0)
  {
    return std::numeric_limits<uint32_t>::digits;
  }
  const uint32_t one_in_leading_positive = 1U << 31;
  int leading_zeros = 0;
  while (integer_input < one_in_leading_positive)
  {
    integer_input <<= 1;
    ++leading_zeros;
  }
  return leading_zeros;
}

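// Example values (illustrative): CountLeadingZeros(1) == 31 and
// CountLeadingZeros(0x80000000u) == 0; with the zero guard above,
// CountLeadingZeros(0) == 32.
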
inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift,
                                             int32_t *output_inv_sqrt, int *output_shift)
{
  assert(input >= 0);
  if (input <= 1)
  {
    // Handle the input value 1 separately to avoid overflow in that case
    // in the general computation below (b/143972021). Also handle 0 as if it
    // were a 1. 0 is an invalid input here (divide by zero) and 1 is a valid
    // but rare/unrealistic input value. We can expect both to occur in some
    // incompletely trained models, but probably not in fully trained models.
    *output_inv_sqrt = std::numeric_limits<std::int32_t>::max();
    *output_shift = 0;
    return;
  }
  assert(input > 1);
  *output_shift = 11;
  while (input >= (1 << 29))
  {
    input /= 4;
    ++*output_shift;
  }
  const unsigned max_left_shift_bits = CountLeadingZeros(static_cast<uint32_t>(input)) - 1;
  const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
  const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
  *output_shift -= left_shift_bit_pairs;
  input <<= 2 * left_shift_bit_pairs;
  assert(input >= (1 << 27));
  assert(input < (1 << 29));
  using gemmlowp::FixedPoint;
  using gemmlowp::Rescale;
  using gemmlowp::SaturatingRoundingMultiplyByPOT;
  // Using 3 integer bits gives us enough room for the internal arithmetic in
  // this Newton-Raphson iteration.
  using F3 = FixedPoint<int32_t, 3>;
  using F0 = FixedPoint<int32_t, 0>;
  const F3 fixedpoint_input = F3::FromRaw(input >> 1);
  const F3 fixedpoint_half_input = SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
  const F3 fixedpoint_half_three =
      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
  // Newton-Raphson iteration for x = 1 / sqrt(input), using the update
  //   x_{n+1} = x_n * (3 - input * x_n^2) / 2
  // (here `input` has been rescaled into [1/4, 1) above).
  // Naive unoptimized starting guess: x = 1
  F3 x = F3::One();
  // Naive unoptimized number of iterations: 5
  for (int i = 0; i < 5; i++)
  {
    const F3 x3 = Rescale<3>(x * x * x);
    x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3);
  }
  const F0 fixedpoint_half_sqrt_2 =
      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
  x = x * fixedpoint_half_sqrt_2;
  *output_inv_sqrt = x.raw();
  if (*output_shift < 0)
  {
    *output_inv_sqrt <<= -*output_shift;
    *output_shift = 0;
  }
  // Convert right shift (right is positive) to left shift.
  *output_shift *= reverse_shift;
}

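// Usage sketch (illustrative; variable names are hypothetical). This mirrors
// how L2-normalization kernels consume the multiplier/exponent pair:
//
//   int32_t inv_sqrt_multiplier;
//   int inv_sqrt_shift;
//   GetInvSqrtQuantizedMultiplierExp(sum_of_squares, /*reverse_shift=*/-1,
//                                    &inv_sqrt_multiplier, &inv_sqrt_shift);
//   // value / sqrt(sum_of_squares) is then approximated by
//   // MultiplyByQuantizedMultiplier(value, inv_sqrt_multiplier,
//   //                               inv_sqrt_shift);
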
// Comment from tensorflow lite:
//
// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
// BROADCASTING.
//
// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional
// rectangular array of numbers.
//
// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h.
// However, as Dims<N> is to be deprecated, this class exists as an adaptor
// to enable simple unoptimized implementations of element-wise broadcasting
// operations.
template <int N> struct NdArrayDesc
{
  // The "extent" of each dimension. Indices along dimension d must be in the
  // half-open interval [0, extents[d]).
  int extents[N];

  // The number of *elements* (not bytes) between consecutive indices of each
  // dimension.
  int strides[N];
};

// Comment from tensorflow lite:
//
// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
// BROADCASTING.
//
// Same as Offset(), except takes an NdArrayDesc<N> instead of Dims<N>.
inline int SubscriptToIndex(const NdArrayDesc<4> &desc, int i0, int i1, int i2, int i3)
{
  assert(i0 >= 0 && i0 < desc.extents[0]);
  assert(i1 >= 0 && i1 < desc.extents[1]);
  assert(i2 >= 0 && i2 < desc.extents[2]);
  assert(i3 >= 0 && i3 < desc.extents[3]);
  return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] + i3 * desc.strides[3];
}

template <int N> inline int SubscriptToIndexGeneric(const NdArrayDesc<N> *desc, int *iter)
{
  int ret_indx = 0;
  for (size_t idx = 0; idx < static_cast<size_t>(N); idx++)
  {
    assert(iter[idx] >= 0 && iter[idx] < desc->extents[idx]);
    ret_indx += iter[idx] * desc->strides[idx];
  }

  return ret_indx;
}

// Copies dims to desc, calculating strides.
template <int N> inline void CopyDimsToDesc(const Shape &input_shape, NdArrayDesc<N> *desc_out)
{
  int desc_stride = 1;
  for (int i = N - 1; i >= 0; --i)
  {
    desc_out->extents[i] = input_shape.Dims(i);
    desc_out->strides[i] = desc_stride;
    desc_stride *= input_shape.Dims(i);
  }
}

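// Example (illustrative): for a contiguous shape {2, 3, 4, 5} this produces
// extents {2, 3, 4, 5} and strides {60, 20, 5, 1}, so
// SubscriptToIndex(desc, 1, 2, 3, 4) == 1*60 + 2*20 + 3*5 + 4 == 119.
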
template <int N>
inline void
NdArrayDescsForElementwiseBroadcast(const Shape &input0_shape, const Shape &input1_shape,
                                    NdArrayDesc<N> *desc0_out, NdArrayDesc<N> *desc1_out)
{
  assert(desc0_out != nullptr);
  assert(desc1_out != nullptr);

  auto extended_input0_shape = Shape::ExtendedShape(N, input0_shape);
  auto extended_input1_shape = Shape::ExtendedShape(N, input1_shape);

  // Copy dims to desc, calculating strides.
  CopyDimsToDesc<N>(extended_input0_shape, desc0_out);
  CopyDimsToDesc<N>(extended_input1_shape, desc1_out);

  // Walk over each dimension. If the extents are equal do nothing.
  // Otherwise, set the desc with extent 1 to have extent equal to the other
  // and stride 0.
  for (int i = 0; i < N; ++i)
  {
    const int extent0 = extended_input0_shape.Dims(i);
    const int extent1 = extended_input1_shape.Dims(i);
    if (extent0 != extent1)
    {
      if (extent0 == 1)
      {
        desc0_out->strides[i] = 0;
        desc0_out->extents[i] = extent1;
      }
      else
      {
        assert(extent1 == 1);
        desc1_out->strides[i] = 0;
        desc1_out->extents[i] = extent0;
      }
    }
  }
}

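// Example (illustrative): broadcasting shapes {2, 1} and {1, 3} with N == 2.
// After the call, both descs describe the common {2, 3} iteration space, and
// each dimension that had extent 1 gets stride 0 so the same element is
// revisited instead of advancing:
//
//   NdArrayDesc<2> d0, d1;  // hypothetical 2-D instantiation
//   NdArrayDescsForElementwiseBroadcast<2>(shape0, shape1, &d0, &d1);
//   // d0: extents {2, 3}, strides {1, 0}
//   // d1: extents {2, 3}, strides {0, 1}
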
template <int N>
inline void
NdArrayDescsForElementwiseBroadcast(const Shape &input0_shape, const Shape &input1_shape,
                                    const Shape &input2_shape, NdArrayDesc<N> *desc0_out,
                                    NdArrayDesc<N> *desc1_out, NdArrayDesc<N> *desc2_out)
{
  assert(desc0_out != nullptr);
  assert(desc1_out != nullptr);
  assert(desc2_out != nullptr);

  auto extended_input0_shape = Shape::ExtendedShape(N, input0_shape);
  auto extended_input1_shape = Shape::ExtendedShape(N, input1_shape);
  auto extended_input2_shape = Shape::ExtendedShape(N, input2_shape);

  // Copy dims to desc, calculating strides.
  CopyDimsToDesc<N>(extended_input0_shape, desc0_out);
  CopyDimsToDesc<N>(extended_input1_shape, desc1_out);
  CopyDimsToDesc<N>(extended_input2_shape, desc2_out);

  // Walk over each dimension. If the extents are equal do nothing.
  // Otherwise, set the desc with extent 1 to have extent equal to the other
  // and stride 0.
  for (int i = 0; i < N; ++i)
  {
    const int extent0 = extended_input0_shape.Dims(i);
    const int extent1 = extended_input1_shape.Dims(i);
    const int extent2 = extended_input2_shape.Dims(i);

    // The broadcast extent of this dimension is the (unique) non-1 extent.
    int extent = extent0;
    if (extent == 1)
      extent = extent1;
    if (extent == 1)
      extent = extent2;

    assert(extent0 == 1 || extent0 == extent);
    assert(extent1 == 1 || extent1 == extent);
    assert(extent2 == 1 || extent2 == extent);

    if (!(extent0 == extent1 && extent1 == extent2))
    {
      if (extent0 == 1)
      {
        desc0_out->strides[i] = 0;
        desc0_out->extents[i] = extent;
      }
      if (extent1 == 1)
      {
        desc1_out->strides[i] = 0;
        desc1_out->extents[i] = extent;
      }
      if (extent2 == 1)
      {
        desc2_out->strides[i] = 0;
        desc2_out->extents[i] = extent;
      }
    }
  }
}

// Gets next index to iterate through a multidimensional array.
inline bool NextIndex(const int num_dims, const int *dims, int *current)
{
  if (num_dims == 0)
  {
    return false;
  }
  assert(dims != nullptr);
  assert(current != nullptr);
  int carry = 1;
  for (int idx = num_dims - 1; idx >= 0; --idx)
  {
    int current_val = current[idx] + carry;
    assert(dims[idx] >= current_val);
    if (dims[idx] == current_val)
    {
      current[idx] = 0;
    }
    else
    {
      current[idx] = current_val;
      carry = 0;
      break;
    }
  }
  return (carry == 0);
}

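// Usage sketch (illustrative): NextIndex advances `current` like an odometer
// and returns false once every index has been produced, so a whole array can
// be visited with a do/while loop:
//
//   const int dims[2] = {2, 2};
//   int index[2] = {0, 0};
//   do
//   {
//     // visits (0,0), (0,1), (1,0), (1,1)
//   } while (NextIndex(2, dims, index));
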
// Gets the offset of an index when reducing on an axis. When reducing, the
// flattened offset will not change if the input index changes on the given
// axis. For example, if you have a 3D tensor and you are reducing to 2D by
// eliminating axis 0, then index (0, 1, 2) and index (1, 1, 2) map to the
// same flattened offset.
// TODO(kanlig): use Dims to represent dimensions.
inline size_t ReducedOutputOffset(const int num_dims, const int *dims, const int *index,
                                  const int num_axis, const int *axis)
{
  if (num_dims == 0)
  {
    return 0;
  }

  assert(dims != nullptr);
  assert(index != nullptr);

  size_t offset = 0;
  for (int idx = 0; idx < num_dims; ++idx)
  {
    // Skip dimensions that are being reduced over.
    bool is_axis = false;
    if (axis != nullptr)
    {
      for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx)
      {
        if (idx == axis[axis_idx])
        {
          is_axis = true;
          break;
        }
      }
    }
    if (!is_axis)
    {
      offset = offset * static_cast<size_t>(dims[idx]) + static_cast<size_t>(index[idx]);
    }
  }
  return offset;
}

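// Example (illustrative): for dims {2, 3, 4} with axis == {0} (num_axis == 1),
// the first coordinate is skipped, so index (1, 1, 2) and index (0, 1, 2)
// both map to offset 1 * 4 + 2 == 6.
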
template <typename T> void optimized_ops_preload_l1_keep(const T *ptr)
{
#ifdef __GNUC__
  // builtin offered by GCC-compatible compilers including clang
  __builtin_prefetch(ptr, /* 0 means read */ 0, /* 3 means high locality */ 3);
#else
  (void)ptr;
#endif
}

// Writes randomly accessed values from `input` sequentially into `output`.
template <typename T> class SequentialTensorWriter
{
public:
  SequentialTensorWriter(const T *input_data, T *output_data)
      : input_data_(input_data), output_ptr_(output_data)
  {
  }

  void Write(int position) { *output_ptr_++ = input_data_[position]; }
  void WriteN(int position, int len)
  {
    memcpy(output_ptr_, &input_data_[position], sizeof(T) * len);
    output_ptr_ += len;
  }

private:
  const T *input_data_;
  T *output_ptr_;
};

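// Usage sketch (illustrative; `input_data`/`output_data` are hypothetical
// buffers), e.g. for a gather-style kernel:
//
//   SequentialTensorWriter<float> writer(input_data, output_data);
//   writer.Write(5);       // output[0] = input[5]
//   writer.WriteN(10, 3);  // output[1..3] = input[10..12]
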
} // namespace cker
} // namespace nnfw

#endif // __NNFW_CKER_UTILS_H__