/*
 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef __NNFW_CKER_REDUCE_H__
#define __NNFW_CKER_REDUCE_H__

#include "cker/Shape.h"
#include "cker/Types.h"
#include "cker/Utils.h"
#include "cker/neon/neon_check.h"

#include <cassert>
#include <cmath>
#include <cstdint>
#include <limits>
#include <vector>

namespace nnfw
{
namespace cker
{
// NEON-accelerated sum over the innermost dimension. The input is viewed as a
// [input_size, reduce_size] matrix and each contiguous row is summed into one
// output element.

#ifdef USE_NEON
inline void OptimizedReduceSum(const float *input_data, const Shape &input_shape,
                               float *output_data)
{
  const auto input_dims = input_shape.DimsData();
  const auto input_num_dims = input_shape.DimensionsCount();

  // Collapse the shape to [input_size, reduce_size]: all leading dimensions
  // are kept, the innermost dimension is summed away.
  int input_size = 1;
  int reduce_size = 0;
  for (int idx = 0; idx < input_num_dims - 1; idx++)
  {
    input_size *= input_dims[idx];
  }
  reduce_size = input_dims[input_num_dims - 1];
  for (int idx = 0; idx < input_size; idx++)
  {
    // Base of the current row; the innermost axis is contiguous in memory.
    const float *row = input_data + idx * reduce_size;
    int r_idx = 0;
    float tmp_data[4] = {0.f};
    float32x4_t tmp_data_32x4 = vld1q_f32(tmp_data);
    // Main loop: consume 32 floats per iteration, combining them through a
    // pairwise addition tree to shorten the dependency chain.
    for (; r_idx <= reduce_size - 32; r_idx += 32)
    {
      float32x4_t a10 = vld1q_f32(row + r_idx);
      float32x4_t a11 = vld1q_f32(row + r_idx + 4);
      float32x4_t a12 = vld1q_f32(row + r_idx + 8);
      float32x4_t a13 = vld1q_f32(row + r_idx + 12);
      float32x4_t a20 = vld1q_f32(row + r_idx + 16);
      float32x4_t a21 = vld1q_f32(row + r_idx + 20);
      float32x4_t a22 = vld1q_f32(row + r_idx + 24);
      float32x4_t a23 = vld1q_f32(row + r_idx + 28);

      float32x4_t x0 = vaddq_f32(a10, a20);
      float32x4_t x1 = vaddq_f32(a11, a21);
      float32x4_t x2 = vaddq_f32(a12, a22);
      float32x4_t x3 = vaddq_f32(a13, a23);

      float32x4_t y0 = vaddq_f32(x0, x1);
      float32x4_t y1 = vaddq_f32(x2, x3);
      float32x4_t y2 = vaddq_f32(y0, y1);
      tmp_data_32x4 = vaddq_f32(tmp_data_32x4, y2);
    }
    // Tail loop: consume 8 floats per iteration.
    for (; r_idx <= reduce_size - 8; r_idx += 8)
    {
      float32x4_t a1 = vld1q_f32(row + r_idx);
      float32x4_t a2 = vld1q_f32(row + r_idx + 4);
      float32x4_t x = vaddq_f32(a1, a2);
      tmp_data_32x4 = vaddq_f32(tmp_data_32x4, x);
    }
    // Horizontal sum of the four accumulator lanes.
    vst1q_f32(tmp_data, tmp_data_32x4);
    output_data[idx] = tmp_data[0] + tmp_data[1] + tmp_data[2] + tmp_data[3];
    // Scalar remainder: fewer than 8 elements left. When reduce_size < 8 the
    // vector loops never ran, so the r_idx == 0 branch (re)initializes the
    // output with the first element of the row.
    for (; r_idx < reduce_size; r_idx++)
    {
      if (r_idx == 0)
      {
        output_data[idx] = row[0];
      }
      else
      {
        output_data[idx] += row[r_idx];
      }
    }
  }
}
#endif // USE_NEON
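
// A minimal usage sketch (illustrative only; the [2, 4, 16] shape, the data,
// and Shape's initializer_list constructor are assumptions of the example):
//
//   Shape in_shape({2, 4, 16});              // 8 rows of 16 floats each
//   std::vector<float> in(2 * 4 * 16, 1.0f);
//   std::vector<float> out(2 * 4);           // one sum per row
//   OptimizedReduceSum(in.data(), in_shape, out.data());
//   // each out[i] == 16.0f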
// A generic reduce method that can be used for reduce_sum, reduce_mean, etc.
// This method iterates through input data and reduces elements along the
// dimensions given in axis.
template <typename In, typename Out>
inline bool ReduceImpl(const In *input_data, const Shape &input_shape, const Shape &,
                       const int *axis, const int num_axis, int *input_iter,
                       Out reducer(const Out current, const In in), Out *output_data)
{
  const auto input_dims = input_shape.DimsData();
  const auto input_num_dims = input_shape.DimensionsCount();
  // Fast path: when only the innermost dimension is reduced, treat the input
  // as a [input_size, reduce_size] matrix and reduce each contiguous row.
  if (num_axis == 1 && axis[0] == input_num_dims - 1)
  {
    int input_size = 1;
    int reduce_size = 0;
    for (int idx = 0; idx < input_num_dims - 1; idx++)
    {
      input_size *= input_dims[idx];
    }
    reduce_size = input_dims[input_num_dims - 1];
    for (int idx = 0; idx < input_size; idx++)
    {
      for (int r_idx = 0; r_idx < reduce_size; r_idx++)
      {
        if (r_idx == 0)
        {
          output_data[idx] = input_data[idx * reduce_size];
        }
        else
        {
          output_data[idx] = reducer(output_data[idx], input_data[idx * reduce_size + r_idx]);
        }
      }
    }
    return true;
  }
  // Reset input iterator.
  for (int idx = 0; idx < input_num_dims; ++idx)
  {
    input_iter[idx] = 0;
  }

  // Iterate through input_data, folding each element into the output slot
  // obtained by dropping the reduced dimensions from the current index.
  do
  {
    size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
    size_t output_offset =
      ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis);
    output_data[output_offset] = reducer(output_data[output_offset], input_data[input_offset]);
  } while (NextIndex(input_num_dims, input_dims, input_iter));
  return true;
}
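
// A direct-call sketch (illustrative; callers normally go through
// Reduce::ReduceGeneric below, and the shapes, data, and SumReducer helper
// are assumptions of the example):
//
//   float SumReducer(const float current, const float in) { return current + in; }
//   ...
//   Shape in_shape({2, 3});
//   std::vector<float> in{1, 2, 3, 4, 5, 6};
//   float out[2];
//   int axis[1] = {1};  // innermost axis -> contiguous fast path
//   int input_iter[2];  // scratch space, one slot per input dimension
//   ReduceImpl<float, float>(in.data(), in_shape, Shape{}, axis, 1, input_iter,
//                            SumReducer, out);
//   // out == {6, 15} (row sums)
//
// Outside the innermost-axis fast path, output_data must be pre-initialized
// (as ReduceGeneric does), since ReduceImpl only folds into it.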
// This method parses the input 'axis' to remove duplicates and handle negative
// values, and returns a valid 'out_axis'.
inline bool ResolveAxis(const int num_dims, const std::vector<int> &axes, int *out_axis,
                        int *out_num_axis)
{
  auto num_axis = axes.size();
  auto axis = axes.data();

  *out_num_axis = 0; // Just in case.
  // Short-circuit axis resolution for scalars; the axis will go unused.
  if (num_dims == 0)
  {
    return true;
  }
  // O(n^2) is fine since out_num_axis should be really small, mostly <= 4.
  for (size_t idx = 0; idx < num_axis; ++idx)
  {
    // Handle negative index. A positive index 'p_idx' can be represented as a
    // negative index 'n_idx': n_idx = p_idx - num_dims.
    // e.g. for num_dims = 3, [0, 1, 2] is the same as [-3, -2, -1].
    int current = axis[idx] < 0 ? (axis[idx] + num_dims) : axis[idx];
    assert(current >= 0 && current < num_dims);
    // Only keep the axis if it has not been seen before.
    bool is_dup = false;
    for (int j = 0; j < *out_num_axis; ++j)
    {
      if (out_axis[j] == current)
      {
        is_dup = true;
        break;
      }
    }
    if (!is_dup)
    {
      out_axis[*out_num_axis] = current;
      *out_num_axis += 1;
    }
  }
  return true;
}
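
// A worked example (input values assumed for illustration):
//
//   int out_axis[4];
//   int out_num_axis = 0;
//   ResolveAxis(/*num_dims=*/3, /*axes=*/{-1, 2, 0}, out_axis, &out_num_axis);
//   // -1 resolves to 2; the explicit 2 is then dropped as a duplicate,
//   // leaving out_axis == {2, 0} and out_num_axis == 2.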
// Fills 'data', which covers all elements of 'shape', with 'init_value'.
// Returns false if the element count overflows size_t.
template <typename T>
inline bool InitTensorDataForReduce(const Shape &shape, const T init_value, T *data)
{
  const auto dims = shape.DimsData();
  const auto num_dims = shape.DimensionsCount();
  size_t num_elements = 1;
  for (int idx = 0; idx < num_dims; ++idx)
  {
    size_t current = static_cast<size_t>(dims[idx]);
    // Overflow prevention.
    if (num_elements > std::numeric_limits<size_t>::max() / current)
    {
      return false;
    }
    num_elements *= current;
  }
  for (size_t idx = 0; idx < num_elements; ++idx)
  {
    data[idx] = init_value;
  }
  return true;
}
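
// For instance, a reduce-max output would be seeded with the identity for max
// (shape and values assumed for illustration):
//
//   Shape out_shape({2, 3});
//   float out[6];
//   InitTensorDataForReduce(out_shape, std::numeric_limits<float>::lowest(), out);
//   // all six elements now hold the lowest representable float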
class Reduce
{
public:
  Reduce() : _temp_index(), _resolved_axis(), _prepared(false) {}

  void prepare(size_t temp_index_size, size_t resolved_axis_size)
  {
    if (_prepared)
      return;

    // Prepare space for temp_index and resolved_axis. Sizes up to
    // kMaxSmallSize are served from the fixed-size members, so the vectors
    // only grow when a request exceeds that.
    if (temp_index_size > kMaxSmallSize)
      _temp_index.resize(temp_index_size);
    if (resolved_axis_size > kMaxSmallSize)
      _resolved_axis.resize(resolved_axis_size);
    _prepared = true;
  }
  // Computes the generic value (i.e., sum/max/min/prod) of elements across
  // dimensions given in axis. It needs to pass in init_value and reducer.
  template <typename T>
  inline bool ReduceGeneric(const Shape &input_shape, const T *input_data,
                            const Shape &output_shape, T *output_data, const std::vector<int> &axes,
                            bool, T init_value, T reducer(const T current, const T in))
  {
    // Reset output data.
    if (!InitTensorDataForReduce(output_shape, init_value, output_data))
    {
      return false;
    }

    // Resolve axis.
    int num_resolved_axis = 0;
    if (!ResolveAxis(input_shape.DimensionsCount(), axes, resolved_axis_data(), &num_resolved_axis))
    {
      return false;
    }

    return ReduceImpl<T, T>(input_data, input_shape, output_shape, resolved_axis_data(),
                            num_resolved_axis, temp_index_data(), reducer, output_data);
  }
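
  // A minimal end-to-end sketch (illustrative; the shapes, data, and the
  // SumReducer helper are assumptions of the example):
  //
  //   float SumReducer(const float current, const float in) { return current + in; }
  //   ...
  //   Reduce reduce_op;
  //   Shape in_shape({2, 3});
  //   Shape out_shape({3});
  //   std::vector<float> in{1, 2, 3, 4, 5, 6};
  //   std::vector<float> out(3);
  //   reduce_op.prepare(in_shape.DimensionsCount(), /*resolved_axis_size=*/1);
  //   reduce_op.ReduceGeneric<float>(in_shape, in.data(), out_shape, out.data(),
  //                                  /*axes=*/{0}, /*keep_dims=*/false,
  //                                  /*init_value=*/0.0f, SumReducer);
  //   // out == {5, 7, 9} (column sums over axis 0)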
  // Computes the mean of elements across dimensions given in axis.
  // It does so in two stages: first it calculates the sum of elements along
  // the axis, then it divides the sum by the number of elements in the axis,
  // for quantized values.
  template <typename T, typename U>
  inline bool QuantizedMeanOrSum(const T *input_data, int32_t input_zero_point, float input_scale,
                                 const Shape &input_shape, T *output_data,
                                 int32_t output_zero_point, float output_scale,
                                 const Shape &output_shape, const std::vector<int> &axes,
                                 bool /*keep_dims*/, U *temp_sum, bool compute_sum,
                                 U reducer(const U current, const T in))
  {
    // Reset output data.
    size_t num_outputs = 1;
    for (int idx = 0; idx < output_shape.DimensionsCount(); ++idx)
    {
      size_t current = static_cast<size_t>(output_shape.Dims(idx));
      // Overflow prevention.
      if (num_outputs > std::numeric_limits<size_t>::max() / current)
      {
        return false;
      }
      num_outputs *= current;
    }
    for (size_t idx = 0; idx < num_outputs; ++idx)
    {
      output_data[idx] = T();
      temp_sum[idx] = U();
    }
    // Resolve axis.
    int num_resolved_axis = 0;
    if (!ResolveAxis(input_shape.DimensionsCount(), axes, resolved_axis_data(), &num_resolved_axis))
    {
      return false;
    }

    if (!ReduceImpl<T, U>(input_data, input_shape, output_shape, resolved_axis_data(),
                          num_resolved_axis, temp_index_data(), reducer, temp_sum))
    {
      return false;
    }
    // Calculate mean by dividing output_data by the number of aggregated elements.
    U num_elements_in_axis = 1;
    for (int idx = 0; idx < num_resolved_axis; ++idx)
    {
      size_t current = static_cast<size_t>(input_shape.Dims(resolved_axis_data()[idx]));
      // Overflow prevention.
      if (current > static_cast<size_t>(std::numeric_limits<U>::max() / num_elements_in_axis))
      {
        return false;
      }
      num_elements_in_axis *= current;
    }
    if (num_elements_in_axis > 0)
    {
      const float scale = input_scale / output_scale;
      if (compute_sum)
      {
        // TODO(b/116341117): Eliminate float and do this completely in 8bit.
        const float bias = -input_zero_point * scale * num_elements_in_axis + 0.5f;
        for (size_t idx = 0; idx < num_outputs; ++idx)
        {
          const U value =
            static_cast<U>(std::round(temp_sum[idx] * scale + bias)) + output_zero_point;
          output_data[idx] = static_cast<T>(value);
        }
      }
      else
      {
        const float bias = -input_zero_point * scale + 0.5f;
        for (size_t idx = 0; idx < num_outputs; ++idx)
        {
          float float_mean =
            static_cast<float>(temp_sum[idx]) / static_cast<float>(num_elements_in_axis);
          float result = std::min(std::round(float_mean * scale + bias) + output_zero_point,
                                  static_cast<float>(std::numeric_limits<T>::max()));
          result = std::max(result, static_cast<float>(std::numeric_limits<T>::min()));
          output_data[idx] = static_cast<T>(result);
        }
      }
    }
    return true;
  }
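
  // Tracing the mean branch above with assumed numbers: input_scale = 0.5,
  // output_scale = 1.0, input_zero_point = 10, 4 elements reduced, and
  // temp_sum[idx] = 56 gives
  //
  //   scale      = 0.5 / 1.0         = 0.5
  //   bias       = -10 * 0.5 + 0.5   = -4.5
  //   float_mean = 56 / 4            = 14.0
  //   result     = round(14.0 * 0.5 - 4.5) + output_zero_point
  //              = 3 + output_zero_point (before clamping to T's range)
  //
  // i.e. the mean is rescaled into the output's quantization domain and then
  // clamped to the representable range of T.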
  // Return pointers to the scratch buffers, using the inline small arrays
  // unless prepare() had to allocate larger vectors.
  inline int32_t *resolved_axis_data(void)
  {
    return _resolved_axis.size() ? _resolved_axis.data() : _resolved_axis_small;
  }
  inline int32_t *temp_index_data(void)
  {
    return _temp_index.size() ? _temp_index.data() : _temp_index_small;
  }
private:
  std::vector<int> _temp_index;
  std::vector<int> _resolved_axis;
  bool _prepared;
  static constexpr int kMaxSmallSize = 4;
  int _temp_index_small[kMaxSmallSize];
  int _resolved_axis_small[kMaxSmallSize];
};
} // namespace cker
} // namespace nnfw

#endif // __NNFW_CKER_REDUCE_H__