compute/cker/include/cker/operation/Reduce.h

   1 /*
   2  * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
   3  * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
   4  *
   5  * Licensed under the Apache License, Version 2.0 (the "License");
   6  * you may not use this file except in compliance with the License.
   7  * You may obtain a copy of the License at
   8  *
   9  *      http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 #ifndef __NNFW_CKER_REDUCE_H__
  19 #define __NNFW_CKER_REDUCE_H__
  20
  21 #include "cker/Shape.h"
  22 #include "cker/Types.h"
  23 #include "cker/Utils.h"
  24
  25 namespace nnfw
  26 {
  27 namespace cker
  28 {
  29
  30 // A generic reduce method that can be used for reduce_sum, reduce_mean, etc.
  31 // This method iterates through input data and reduce elements along the
  32 // dimensions given in axis.
  33 template <typename In, typename Out>
  34 inline bool ReduceImpl(const In *input_data, const Shape &input_shape, const Shape &,
  35                        const int *axis, const int num_axis, int *input_iter,
  36                        Out reducer(const Out current, const In in), Out *output_data)
  37 {
  38   const auto input_dims = input_shape.DimsData();
  39   const auto input_num_dims = input_shape.DimensionsCount();
  40
  41   // Reset input iterator.
  42   for (int idx = 0; idx < input_num_dims; ++idx)
  43   {
  44     input_iter[idx] = 0;
  45   }
  46   // Iterate through input_data.
  47   do
  48   {
  49     size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
  50     size_t output_offset =
  51         ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis);
  52     output_data[output_offset] = reducer(output_data[output_offset], input_data[input_offset]);
  53   } while (NextIndex(input_num_dims, input_dims, input_iter));
  54   return true;
  55 }
  56
  57 // This method parses the input 'axis' to remove duplicates and handle negative
  58 // values, and returns a valid 'out_axis'
  59 inline bool ResolveAxis(const int num_dims, const std::vector<int> &axes, int *out_axis,
  60                         int *out_num_axis)
  61 {
  62   auto num_axis = axes.size();
  63   auto axis = axes.data();
  64
  65   *out_num_axis = 0; // Just in case.
  66   // Short-circuit axis resolution for scalars; the axis will go unused.
  67   if (num_dims == 0)
  68   {
  69     return true;
  70   }
  71   // o(n^2) is fine since out_num_axis should be really small, mostly <= 4
  72   for (size_t idx = 0; idx < num_axis; ++idx)
  73   {
  74     // Handle negative index. A positive index 'p_idx' can be represented as a
  75     // negative index 'n_idx' as: n_idx = p_idx-num_dims
  76     // eg: For num_dims=3, [0, 1, 2] is the same as [-3, -2, -1]  */
  77     int current = axis[idx] < 0 ? (axis[idx] + num_dims) : axis[idx];
  78     assert(current >= 0 && current < num_dims);
  79     bool is_dup = false;
  80     for (int j = 0; j < *out_num_axis; ++j)
  81     {
  82       if (out_axis[j] == current)
  83       {
  84         is_dup = true;
  85         break;
  86       }
  87     }
  88     if (!is_dup)
  89     {
  90       out_axis[*out_num_axis] = current;
  91       *out_num_axis += 1;
  92     }
  93   }
  94   return true;
  95 }
  96
  97 template <typename T>
  98 inline bool InitTensorDataForReduce(const Shape &shape, const T init_value, T *data)
  99 {
 100   const auto dims = shape.DimsData();
 101   const auto num_dims = shape.DimensionsCount();
 102   size_t num_elements = 1;
 103   for (int idx = 0; idx < num_dims; ++idx)
 104   {
 105     size_t current = static_cast<size_t>(dims[idx]);
 106     // Overflow prevention.
 107     if (num_elements > std::numeric_limits<size_t>::max() / current)
 108     {
 109       return false;
 110     }
 111     num_elements *= current;
 112   }
 113   for (size_t idx = 0; idx < num_elements; ++idx)
 114   {
 115     data[idx] = init_value;
 116   }
 117   return true;
 118 }
 119
 120 class Reduce
 121 {
 122 public:
 123   Reduce() : _temp_index(), _resolved_axis(), _prepared(false) {}
 124
 125   void prepare(size_t temp_index_size, size_t resolved_axis_size)
 126   {
 127     if (_prepared)
 128       return;
 129
 130     // prepare space for temp_index and resolved_axis
 131     if (temp_index_size > kMaxSmallSize)
 132       _temp_index.resize(temp_index_size);
 133     if (resolved_axis_size > kMaxSmallSize)
 134       _resolved_axis.resize(resolved_axis_size);
 135     _prepared = true;
 136   }
 137
 138   // Computes the generic value (i.e., sum/max/min/prod) of elements across
 139   // dimensions given in axis. It needs to pass in init_value and reducer.
 140   template <typename T>
 141   inline bool ReduceGeneric(const Shape &input_shape, const T *input_data,
 142                             const Shape &output_shape, T *output_data, const std::vector<int> &axes,
 143                             bool, T init_value, T reducer(const T current, const T in))
 144   {
 145     // Reset output data.
 146     if (!InitTensorDataForReduce(output_shape, init_value, output_data))
 147     {
 148       return false;
 149     }
 150
 151     // Resolve axis.
 152     int num_resolved_axis = 0;
 153     if (!ResolveAxis(input_shape.DimensionsCount(), axes, resolved_axis_data(), &num_resolved_axis))
 154     {
 155       return false;
 156     }
 157
 158     return ReduceImpl<T, T>(input_data, input_shape, output_shape, resolved_axis_data(),
 159                             num_resolved_axis, temp_index_data(), reducer, output_data);
 160   }
 161
 162   // Computes the mean of elements across dimensions given in axis.
 163   // It does so in two stages, first calculates the sum of elements along the axis
 164   // then divides it by the number of element in axis for quantized values.
 165   template <typename T, typename U>
 166   inline bool QuantizedMeanOrSum(const T *input_data, int32_t input_zero_point, float input_scale,
 167                                  const Shape &input_shape, T *output_data,
 168                                  int32_t output_zero_point, float output_scale,
 169                                  const Shape &output_shape, const std::vector<int> &axes,
 170                                  bool /*keep_dims*/, U *temp_sum, bool compute_sum,
 171                                  U reducer(const U current, const T in))
 172   {
 173     // Reset output data.
 174     size_t num_outputs = 1;
 175     for (int idx = 0; idx < output_shape.DimensionsCount(); ++idx)
 176     {
 177       size_t current = static_cast<size_t>(output_shape.Dims(idx));
 178       // Overflow prevention.
 179       if (num_outputs > std::numeric_limits<size_t>::max() / current)
 180       {
 181         return false;
 182       }
 183       num_outputs *= current;
 184     }
 185     for (size_t idx = 0; idx < num_outputs; ++idx)
 186     {
 187       output_data[idx] = T();
 188       temp_sum[idx] = U();
 189     }
 190
 191     // Resolve axis.
 192     int num_resolved_axis = 0;
 193     if (!ResolveAxis(input_shape.DimensionsCount(), axes, resolved_axis_data(), &num_resolved_axis))
 194     {
 195       return false;
 196     }
 197
 198     if (!ReduceImpl<T, U>(input_data, input_shape, output_shape, resolved_axis_data(),
 199                           num_resolved_axis, temp_index_data(), reducer, temp_sum))
 200     {
 201       return false;
 202     }
 203
 204     // Calculate mean by dividing output_data by num of aggregated element.
 205     U num_elements_in_axis = 1;
 206     for (int idx = 0; idx < num_resolved_axis; ++idx)
 207     {
 208       size_t current = static_cast<size_t>(input_shape.Dims(resolved_axis_data()[idx]));
 209       // Overflow prevention.
 210       if (current > static_cast<size_t>(std::numeric_limits<U>::max() / num_elements_in_axis))
 211       {
 212         return false;
 213       }
 214       num_elements_in_axis *= current;
 215     }
 216
 217     if (num_elements_in_axis > 0)
 218     {
 219       const float scale = input_scale / output_scale;
 220       if (compute_sum)
 221       {
 222         // TODO(b/116341117): Eliminate float and do this completely in 8bit.
 223         const float bias = -input_zero_point * scale * num_elements_in_axis + 0.5f;
 224         for (size_t idx = 0; idx < num_outputs; ++idx)
 225         {
 226           const U value =
 227               static_cast<U>(std::round(temp_sum[idx] * scale + bias)) + output_zero_point;
 228           output_data[idx] = static_cast<T>(value);
 229         }
 230       }
 231       else
 232       {
 233         const float bias = -input_zero_point * scale + 0.5f;
 234         for (size_t idx = 0; idx < num_outputs; ++idx)
 235         {
 236           float float_mean =
 237               static_cast<float>(temp_sum[idx]) / static_cast<float>(num_elements_in_axis);
 238           float result = std::min(std::round(float_mean * scale + bias) + output_zero_point,
 239                                   static_cast<float>(std::numeric_limits<T>::max()));
 240           result = std::max(result, static_cast<float>(std::numeric_limits<T>::min()));
 241           output_data[idx] = static_cast<T>(result);
 242         }
 243       }
 244     }
 245     return true;
 246   }
 247
 248   inline int32_t *resolved_axis_data(void)
 249   {
 250     return _resolved_axis.size() ? _resolved_axis.data() : _resolved_axis_small;
 251   }
 252   inline int32_t *temp_index_data(void)
 253   {
 254     return _temp_index.size() ? _temp_index.data() : _temp_index_small;
 255   }
 256
 257 private:
 258   std::vector<int> _temp_index;
 259   std::vector<int> _resolved_axis;
 260   bool _prepared;
 261   static constexpr int kMaxSmallSize = 4;
 262   int _temp_index_small[kMaxSmallSize];
 263   int _resolved_axis_small[kMaxSmallSize];
 264 };
 265
 266 } // namespace cker
 267 } // namespace nnfw
 268
 269 #endif // __NNFW_CKER_REDUCE_H__