/*
 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef __NNFW_CKER_REDUCE_H__
#define __NNFW_CKER_REDUCE_H__

#include "cker/Shape.h"
#include "cker/Types.h"
#include "cker/Utils.h"
#include "cker/neon/neon_check.h"

#include <cassert>
#include <cmath>
#include <cstdint>
#include <limits>
#include <vector>

namespace nnfw
{
namespace cker
{
// NEON-accelerated sum over the innermost dimension. The input is viewed as a
// [input_size, reduce_size] matrix and each contiguous row is summed into one
// output element.

#ifdef USE_NEON
inline void OptimizedReduceSum(const float *input_data, const Shape &input_shape,
                               float *output_data)
{
  const auto input_dims = input_shape.DimsData();
  const auto input_num_dims = input_shape.DimensionsCount();

  // Collapse the shape to [input_size, reduce_size]: all leading dimensions
  // are kept, the innermost dimension is summed away.
  int input_size = 1;
  int reduce_size = 0;
  for (int idx = 0; idx < input_num_dims - 1; idx++)
  {
    input_size *= input_dims[idx];
  }
  reduce_size = input_dims[input_num_dims - 1];
  for (int idx = 0; idx < input_size; idx++)
  {
    // Base of the current row; the innermost axis is contiguous in memory.
    const float *row = input_data + idx * reduce_size;
    int r_idx = 0;
    float tmp_data[4] = {0.f};
    float32x4_t tmp_data_32x4 = vld1q_f32(tmp_data);
    // Main loop: consume 32 floats per iteration, combining them through a
    // pairwise addition tree to shorten the dependency chain.
    for (; r_idx <= reduce_size - 32; r_idx += 32)
    {
      float32x4_t a10 = vld1q_f32(row + r_idx);
      float32x4_t a11 = vld1q_f32(row + r_idx + 4);
      float32x4_t a12 = vld1q_f32(row + r_idx + 8);
      float32x4_t a13 = vld1q_f32(row + r_idx + 12);
      float32x4_t a20 = vld1q_f32(row + r_idx + 16);
      float32x4_t a21 = vld1q_f32(row + r_idx + 20);
      float32x4_t a22 = vld1q_f32(row + r_idx + 24);
      float32x4_t a23 = vld1q_f32(row + r_idx + 28);

      float32x4_t x0 = vaddq_f32(a10, a20);
      float32x4_t x1 = vaddq_f32(a11, a21);
      float32x4_t x2 = vaddq_f32(a12, a22);
      float32x4_t x3 = vaddq_f32(a13, a23);

      float32x4_t y0 = vaddq_f32(x0, x1);
      float32x4_t y1 = vaddq_f32(x2, x3);
      float32x4_t y2 = vaddq_f32(y0, y1);
      tmp_data_32x4 = vaddq_f32(tmp_data_32x4, y2);
    }
    // Tail loop: consume 8 floats per iteration.
    for (; r_idx <= reduce_size - 8; r_idx += 8)
    {
      float32x4_t a1 = vld1q_f32(row + r_idx);
      float32x4_t a2 = vld1q_f32(row + r_idx + 4);
      float32x4_t x = vaddq_f32(a1, a2);
      tmp_data_32x4 = vaddq_f32(tmp_data_32x4, x);
    }
    // Horizontal sum of the four accumulator lanes.
    vst1q_f32(tmp_data, tmp_data_32x4);
    output_data[idx] = tmp_data[0] + tmp_data[1] + tmp_data[2] + tmp_data[3];
    // Scalar remainder: fewer than 8 elements left. When reduce_size < 8 the
    // vector loops never ran, so the r_idx == 0 branch (re)initializes the
    // output with the first element of the row.
    for (; r_idx < reduce_size; r_idx++)
    {
      if (r_idx == 0)
      {
        output_data[idx] = row[0];
      }
      else
      {
        output_data[idx] += row[r_idx];
      }
    }
  }
}
#endif // USE_NEON
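
// A minimal usage sketch (illustrative only; the [2, 4, 16] shape, the data,
// and Shape's initializer_list constructor are assumptions of the example):
//
//   Shape in_shape({2, 4, 16});              // 8 rows of 16 floats each
//   std::vector<float> in(2 * 4 * 16, 1.0f);
//   std::vector<float> out(2 * 4);           // one sum per row
//   OptimizedReduceSum(in.data(), in_shape, out.data());
//   // each out[i] == 16.0f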
// A generic reduce method that can be used for reduce_sum, reduce_mean, etc.
// This method iterates through input data and reduces elements along the
// dimensions given in axis.
template <typename In, typename Out>
inline bool ReduceImpl(const In *input_data, const Shape &input_shape, const Shape &,
                       const int *axis, const int num_axis, int *input_iter,
                       Out reducer(const Out current, const In in), Out *output_data)
{
  const auto input_dims = input_shape.DimsData();
  const auto input_num_dims = input_shape.DimensionsCount();
  // Fast path: when only the innermost dimension is reduced, treat the input
  // as a [input_size, reduce_size] matrix and reduce each contiguous row.
  if (num_axis == 1 && axis[0] == input_num_dims - 1)
  {
    int input_size = 1;
    int reduce_size = 0;
    for (int idx = 0; idx < input_num_dims - 1; idx++)
    {
      input_size *= input_dims[idx];
    }
    reduce_size = input_dims[input_num_dims - 1];
    for (int idx = 0; idx < input_size; idx++)
    {
      for (int r_idx = 0; r_idx < reduce_size; r_idx++)
      {
        if (r_idx == 0)
        {
          output_data[idx] = input_data[idx * reduce_size];
        }
        else
        {
          output_data[idx] = reducer(output_data[idx], input_data[idx * reduce_size + r_idx]);
        }
      }
    }
    return true;
  }
  // Reset input iterator.
  for (int idx = 0; idx < input_num_dims; ++idx)
  {
    input_iter[idx] = 0;
  }

  // Iterate through input_data, folding each element into the output slot
  // obtained by dropping the reduced dimensions from the current index.
  do
  {
    size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
    size_t output_offset =
      ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis);
    output_data[output_offset] = reducer(output_data[output_offset], input_data[input_offset]);
  } while (NextIndex(input_num_dims, input_dims, input_iter));
  return true;
}
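
// A direct-call sketch (illustrative; callers normally go through
// Reduce::ReduceGeneric below, and the shapes, data, and SumReducer helper
// are assumptions of the example):
//
//   float SumReducer(const float current, const float in) { return current + in; }
//   ...
//   Shape in_shape({2, 3});
//   std::vector<float> in{1, 2, 3, 4, 5, 6};
//   float out[2];
//   int axis[1] = {1};  // innermost axis -> contiguous fast path
//   int input_iter[2];  // scratch space, one slot per input dimension
//   ReduceImpl<float, float>(in.data(), in_shape, Shape{}, axis, 1, input_iter,
//                            SumReducer, out);
//   // out == {6, 15} (row sums)
//
// Outside the innermost-axis fast path, output_data must be pre-initialized
// (as ReduceGeneric does), since ReduceImpl only folds into it.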
// This method parses the input 'axis' to remove duplicates and handle negative
// values, and returns a valid 'out_axis'.
inline bool ResolveAxis(const int num_dims, const std::vector<int> &axes, int *out_axis,
                        int *out_num_axis)
{
  auto num_axis = axes.size();
  auto axis = axes.data();

  *out_num_axis = 0; // Just in case.
  // Short-circuit axis resolution for scalars; the axis will go unused.
  if (num_dims == 0)
  {
    return true;
  }
  // O(n^2) is fine since out_num_axis should be really small, mostly <= 4.
  for (size_t idx = 0; idx < num_axis; ++idx)
  {
    // Handle negative index. A positive index 'p_idx' can be represented as a
    // negative index 'n_idx': n_idx = p_idx - num_dims.
    // e.g. for num_dims = 3, [0, 1, 2] is the same as [-3, -2, -1].
    int current = axis[idx] < 0 ? (axis[idx] + num_dims) : axis[idx];
    assert(current >= 0 && current < num_dims);
    // Only keep the axis if it has not been seen before.
    bool is_dup = false;
    for (int j = 0; j < *out_num_axis; ++j)
    {
      if (out_axis[j] == current)
      {
        is_dup = true;
        break;
      }
    }
    if (!is_dup)
    {
      out_axis[*out_num_axis] = current;
      *out_num_axis += 1;
    }
  }
  return true;
}
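
// A worked example (input values assumed for illustration):
//
//   int out_axis[4];
//   int out_num_axis = 0;
//   ResolveAxis(/*num_dims=*/3, /*axes=*/{-1, 2, 0}, out_axis, &out_num_axis);
//   // -1 resolves to 2; the explicit 2 is then dropped as a duplicate,
//   // leaving out_axis == {2, 0} and out_num_axis == 2.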
// Fills 'data', which covers all elements of 'shape', with 'init_value'.
// Returns false if the element count overflows size_t.
template <typename T>
inline bool InitTensorDataForReduce(const Shape &shape, const T init_value, T *data)
{
  const auto dims = shape.DimsData();
  const auto num_dims = shape.DimensionsCount();
  size_t num_elements = 1;
  for (int idx = 0; idx < num_dims; ++idx)
  {
    size_t current = static_cast<size_t>(dims[idx]);
    // Overflow prevention.
    if (num_elements > std::numeric_limits<size_t>::max() / current)
    {
      return false;
    }
    num_elements *= current;
  }
  for (size_t idx = 0; idx < num_elements; ++idx)
  {
    data[idx] = init_value;
  }
  return true;
}
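
// For instance, a reduce-max output would be seeded with the identity for max
// (shape and values assumed for illustration):
//
//   Shape out_shape({2, 3});
//   float out[6];
//   InitTensorDataForReduce(out_shape, std::numeric_limits<float>::lowest(), out);
//   // all six elements now hold the lowest representable float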
class Reduce
{
public:
  Reduce() : _temp_index(), _resolved_axis(), _prepared(false) {}

  void prepare(size_t temp_index_size, size_t resolved_axis_size)
  {
    if (_prepared)
      return;

    // Prepare space for temp_index and resolved_axis. Sizes up to
    // kMaxSmallSize are served from the fixed-size members, so the vectors
    // only grow when a request exceeds that.
    if (temp_index_size > kMaxSmallSize)
      _temp_index.resize(temp_index_size);
    if (resolved_axis_size > kMaxSmallSize)
      _resolved_axis.resize(resolved_axis_size);
    _prepared = true;
  }
  // Computes the generic value (i.e., sum/max/min/prod) of elements across
  // dimensions given in axis. It needs to pass in init_value and reducer.
  template <typename T>
  inline bool ReduceGeneric(const Shape &input_shape, const T *input_data,
                            const Shape &output_shape, T *output_data, const std::vector<int> &axes,
                            bool, T init_value, T reducer(const T current, const T in))
  {
    // Reset output data.
    if (!InitTensorDataForReduce(output_shape, init_value, output_data))
    {
      return false;
    }

    // Resolve axis.
    int num_resolved_axis = 0;
    if (!ResolveAxis(input_shape.DimensionsCount(), axes, resolved_axis_data(), &num_resolved_axis))
    {
      return false;
    }

    return ReduceImpl<T, T>(input_data, input_shape, output_shape, resolved_axis_data(),
                            num_resolved_axis, temp_index_data(), reducer, output_data);
  }
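
  // A minimal end-to-end sketch (illustrative; the shapes, data, and the
  // SumReducer helper are assumptions of the example):
  //
  //   float SumReducer(const float current, const float in) { return current + in; }
  //   ...
  //   Reduce reduce_op;
  //   Shape in_shape({2, 3});
  //   Shape out_shape({3});
  //   std::vector<float> in{1, 2, 3, 4, 5, 6};
  //   std::vector<float> out(3);
  //   reduce_op.prepare(in_shape.DimensionsCount(), /*resolved_axis_size=*/1);
  //   reduce_op.ReduceGeneric<float>(in_shape, in.data(), out_shape, out.data(),
  //                                  /*axes=*/{0}, /*keep_dims=*/false,
  //                                  /*init_value=*/0.0f, SumReducer);
  //   // out == {5, 7, 9} (column sums over axis 0)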
  // Computes the mean of elements across dimensions given in axis.
  // It does so in two stages: first it calculates the sum of elements along
  // the axis, then it divides the sum by the number of elements in the axis,
  // for quantized values.
  template <typename T, typename U>
  inline bool QuantizedMeanOrSum(const T *input_data, int32_t input_zero_point, float input_scale,
                                 const Shape &input_shape, T *output_data,
                                 int32_t output_zero_point, float output_scale,
                                 const Shape &output_shape, const std::vector<int> &axes,
                                 bool /*keep_dims*/, U *temp_sum, bool compute_sum,
                                 U reducer(const U current, const T in))
  {
    // Reset output data.
    size_t num_outputs = 1;
    for (int idx = 0; idx < output_shape.DimensionsCount(); ++idx)
    {
      size_t current = static_cast<size_t>(output_shape.Dims(idx));
      // Overflow prevention.
      if (num_outputs > std::numeric_limits<size_t>::max() / current)
      {
        return false;
      }
      num_outputs *= current;
    }
    for (size_t idx = 0; idx < num_outputs; ++idx)
    {
      output_data[idx] = T();
      temp_sum[idx] = U();
    }
    // Resolve axis.
    int num_resolved_axis = 0;
    if (!ResolveAxis(input_shape.DimensionsCount(), axes, resolved_axis_data(), &num_resolved_axis))
    {
      return false;
    }

    if (!ReduceImpl<T, U>(input_data, input_shape, output_shape, resolved_axis_data(),
                          num_resolved_axis, temp_index_data(), reducer, temp_sum))
    {
      return false;
    }
    // Calculate mean by dividing output_data by the number of aggregated elements.
    U num_elements_in_axis = 1;
    for (int idx = 0; idx < num_resolved_axis; ++idx)
    {
      size_t current = static_cast<size_t>(input_shape.Dims(resolved_axis_data()[idx]));
      // Overflow prevention.
      if (current > static_cast<size_t>(std::numeric_limits<U>::max() / num_elements_in_axis))
      {
        return false;
      }
      num_elements_in_axis *= current;
    }
    if (num_elements_in_axis > 0)
    {
      const float scale = input_scale / output_scale;
      if (compute_sum)
      {
        // TODO(b/116341117): Eliminate float and do this completely in 8bit.
        const float bias = -input_zero_point * scale * num_elements_in_axis + 0.5f;
        for (size_t idx = 0; idx < num_outputs; ++idx)
        {
          const U value =
            static_cast<U>(std::round(temp_sum[idx] * scale + bias)) + output_zero_point;
          output_data[idx] = static_cast<T>(value);
        }
      }
      else
      {
        const float bias = -input_zero_point * scale + 0.5f;
        for (size_t idx = 0; idx < num_outputs; ++idx)
        {
          float float_mean =
            static_cast<float>(temp_sum[idx]) / static_cast<float>(num_elements_in_axis);
          float result = std::min(std::round(float_mean * scale + bias) + output_zero_point,
                                  static_cast<float>(std::numeric_limits<T>::max()));
          result = std::max(result, static_cast<float>(std::numeric_limits<T>::min()));
          output_data[idx] = static_cast<T>(result);
        }
      }
    }
    return true;
  }
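
  // Tracing the mean branch above with assumed numbers: input_scale = 0.5,
  // output_scale = 1.0, input_zero_point = 10, 4 elements reduced, and
  // temp_sum[idx] = 56 gives
  //
  //   scale      = 0.5 / 1.0         = 0.5
  //   bias       = -10 * 0.5 + 0.5   = -4.5
  //   float_mean = 56 / 4            = 14.0
  //   result     = round(14.0 * 0.5 - 4.5) + output_zero_point
  //              = 3 + output_zero_point (before clamping to T's range)
  //
  // i.e. the mean is rescaled into the output's quantization domain and then
  // clamped to the representable range of T.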
  // Return pointers to the scratch buffers, using the inline small arrays
  // unless prepare() had to allocate larger vectors.
  inline int32_t *resolved_axis_data(void)
  {
    return _resolved_axis.size() ? _resolved_axis.data() : _resolved_axis_small;
  }
  inline int32_t *temp_index_data(void)
  {
    return _temp_index.size() ? _temp_index.data() : _temp_index_small;
  }
private:
  std::vector<int> _temp_index;
  std::vector<int> _resolved_axis;
  bool _prepared;
  static constexpr int kMaxSmallSize = 4;
  int _temp_index_small[kMaxSmallSize];
  int _resolved_axis_small[kMaxSmallSize];
};
} // namespace cker
} // namespace nnfw

#endif // __NNFW_CKER_REDUCE_H__