2 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
3 * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
18 #ifndef __NNFW_CKER_REDUCE_H__
19 #define __NNFW_CKER_REDUCE_H__
21 #include "cker/Shape.h"
22 #include "cker/Types.h"
23 #include "cker/Utils.h"
30 // A generic reduce method that can be used for reduce_sum, reduce_mean, etc.
31 // This method iterates through input data and reduce elements along the
32 // dimensions given in axis.
33 template <typename In, typename Out>
34 inline bool ReduceImpl(const In *input_data, const Shape &input_shape, const Shape &,
35 const int *axis, const int num_axis, int *input_iter,
36 Out reducer(const Out current, const In in), Out *output_data)
38 const auto input_dims = input_shape.DimsData();
39 const auto input_num_dims = input_shape.DimensionsCount();
41 // Reset input iterator.
42 for (int idx = 0; idx < input_num_dims; ++idx)
46 // Iterate through input_data.
49 size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
50 size_t output_offset =
51 ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis);
52 output_data[output_offset] = reducer(output_data[output_offset], input_data[input_offset]);
53 } while (NextIndex(input_num_dims, input_dims, input_iter));
57 // This method parses the input 'axis' to remove duplicates and handle negative
58 // values, and returns a valid 'out_axis'
59 inline bool ResolveAxis(const int num_dims, const std::vector<int> &axes, int *out_axis,
62 auto num_axis = axes.size();
63 auto axis = axes.data();
65 *out_num_axis = 0; // Just in case.
66 // Short-circuit axis resolution for scalars; the axis will go unused.
71 // o(n^2) is fine since out_num_axis should be really small, mostly <= 4
72 for (size_t idx = 0; idx < num_axis; ++idx)
74 // Handle negative index. A positive index 'p_idx' can be represented as a
75 // negative index 'n_idx' as: n_idx = p_idx-num_dims
76 // eg: For num_dims=3, [0, 1, 2] is the same as [-3, -2, -1] */
77 int current = axis[idx] < 0 ? (axis[idx] + num_dims) : axis[idx];
78 assert(current >= 0 && current < num_dims);
80 for (int j = 0; j < *out_num_axis; ++j)
82 if (out_axis[j] == current)
90 out_axis[*out_num_axis] = current;
98 inline bool InitTensorDataForReduce(const Shape &shape, const T init_value, T *data)
100 const auto dims = shape.DimsData();
101 const auto num_dims = shape.DimensionsCount();
102 size_t num_elements = 1;
103 for (int idx = 0; idx < num_dims; ++idx)
105 size_t current = static_cast<size_t>(dims[idx]);
106 // Overflow prevention.
107 if (num_elements > std::numeric_limits<size_t>::max() / current)
111 num_elements *= current;
113 for (size_t idx = 0; idx < num_elements; ++idx)
115 data[idx] = init_value;
123 Reduce() : _temp_index(), _resolved_axis(), _prepared(false) {}
125 void prepare(size_t temp_index_size, size_t resolved_axis_size)
130 // prepare space for temp_index and resolved_axis
131 if (temp_index_size > kMaxSmallSize)
132 _temp_index.resize(temp_index_size);
133 if (resolved_axis_size > kMaxSmallSize)
134 _resolved_axis.resize(resolved_axis_size);
138 // Computes the generic value (i.e., sum/max/min/prod) of elements across
139 // dimensions given in axis. It needs to pass in init_value and reducer.
140 template <typename T>
141 inline bool ReduceGeneric(const Shape &input_shape, const T *input_data,
142 const Shape &output_shape, T *output_data, const std::vector<int> &axes,
143 bool, T init_value, T reducer(const T current, const T in))
145 // Reset output data.
146 if (!InitTensorDataForReduce(output_shape, init_value, output_data))
152 int num_resolved_axis = 0;
153 if (!ResolveAxis(input_shape.DimensionsCount(), axes, resolved_axis_data(), &num_resolved_axis))
158 return ReduceImpl<T, T>(input_data, input_shape, output_shape, resolved_axis_data(),
159 num_resolved_axis, temp_index_data(), reducer, output_data);
162 // Computes the mean of elements across dimensions given in axis.
163 // It does so in two stages, first calculates the sum of elements along the axis
164 // then divides it by the number of element in axis for quantized values.
165 template <typename T, typename U>
166 inline bool QuantizedMeanOrSum(const T *input_data, int32_t input_zero_point, float input_scale,
167 const Shape &input_shape, T *output_data,
168 int32_t output_zero_point, float output_scale,
169 const Shape &output_shape, const std::vector<int> &axes,
170 bool /*keep_dims*/, U *temp_sum, bool compute_sum,
171 U reducer(const U current, const T in))
173 // Reset output data.
174 size_t num_outputs = 1;
175 for (int idx = 0; idx < output_shape.DimensionsCount(); ++idx)
177 size_t current = static_cast<size_t>(output_shape.Dims(idx));
178 // Overflow prevention.
179 if (num_outputs > std::numeric_limits<size_t>::max() / current)
183 num_outputs *= current;
185 for (size_t idx = 0; idx < num_outputs; ++idx)
187 output_data[idx] = T();
192 int num_resolved_axis = 0;
193 if (!ResolveAxis(input_shape.DimensionsCount(), axes, resolved_axis_data(), &num_resolved_axis))
198 if (!ReduceImpl<T, U>(input_data, input_shape, output_shape, resolved_axis_data(),
199 num_resolved_axis, temp_index_data(), reducer, temp_sum))
204 // Calculate mean by dividing output_data by num of aggregated element.
205 U num_elements_in_axis = 1;
206 for (int idx = 0; idx < num_resolved_axis; ++idx)
208 size_t current = static_cast<size_t>(input_shape.Dims(resolved_axis_data()[idx]));
209 // Overflow prevention.
210 if (current > static_cast<size_t>(std::numeric_limits<U>::max() / num_elements_in_axis))
214 num_elements_in_axis *= current;
217 if (num_elements_in_axis > 0)
219 const float scale = input_scale / output_scale;
222 // TODO(b/116341117): Eliminate float and do this completely in 8bit.
223 const float bias = -input_zero_point * scale * num_elements_in_axis + 0.5f;
224 for (size_t idx = 0; idx < num_outputs; ++idx)
227 static_cast<U>(std::round(temp_sum[idx] * scale + bias)) + output_zero_point;
228 output_data[idx] = static_cast<T>(value);
233 const float bias = -input_zero_point * scale + 0.5f;
234 for (size_t idx = 0; idx < num_outputs; ++idx)
237 static_cast<float>(temp_sum[idx]) / static_cast<float>(num_elements_in_axis);
238 float result = std::min(std::round(float_mean * scale + bias) + output_zero_point,
239 static_cast<float>(std::numeric_limits<T>::max()));
240 result = std::max(result, static_cast<float>(std::numeric_limits<T>::min()));
241 output_data[idx] = static_cast<T>(result);
248 inline int32_t *resolved_axis_data(void)
250 return _resolved_axis.size() ? _resolved_axis.data() : _resolved_axis_small;
252 inline int32_t *temp_index_data(void)
254 return _temp_index.size() ? _temp_index.data() : _temp_index_small;
258 std::vector<int> _temp_index;
259 std::vector<int> _resolved_axis;
261 static constexpr int kMaxSmallSize = 4;
262 int _temp_index_small[kMaxSmallSize];
263 int _resolved_axis_small[kMaxSmallSize];
269 #endif // __NNFW_CKER_REDUCE_H__