1 // Copyright (C) 2018-2019 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
5 #include "ext_list.hpp"
6 #include "ext_base.hpp"
14 #include "ie_parallel.hpp"
16 namespace InferenceEngine {
17 namespace Extensions {
// CPU extension layer implementing the Reduce* operator family
// (ReduceAnd/L1/L2/LogSum/LogSumExp/Max/Mean/Min/Or/Prod/Sum/SumSquare).
// Input 0 (REDUCE_DATA) is the FP32 tensor to reduce; input 1 (REDUCE_INDEXES)
// is an I32 vector of axes to reduce over.
// NOTE(review): this listing is elided (the embedded original line numbers
// jump), so statements such as the constructor's opening `try {`, else
// branches, and closing braces are not visible here; comments describe only
// what is shown.
20 class ReduceImpl: public ExtLayerBase {
// Constructor: validates edge counts, input precisions and shapes, parses the
// reduce mode from the layer type string, caches source dims/strides, and
// registers a planar (PLN) layout configuration.
22     explicit ReduceImpl(const CNNLayer* layer) {
// Exactly two inputs are required: the data tensor and the axes vector.
24 if (layer->insData.empty() || layer->outData.empty())
25 THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output edges!";
27 if (layer->insData.size() != 2)
28 THROW_IE_EXCEPTION << layer->name << " Incorrect number of input edges!";
// The axes input must be at most 1-D (scalar or vector of axis indices).
30 idx_dims = layer->insData[REDUCE_INDEXES].lock()->getTensorDesc().getDims();
31 if (idx_dims.size() > 1)
32 THROW_IE_EXCEPTION << layer->name << " Index vector should be 1 dimension";
// Only FP32 data and I32 axes are supported by this implementation.
34 if (layer->insData[REDUCE_DATA].lock()->getTensorDesc().getPrecision() != Precision::FP32)
35 THROW_IE_EXCEPTION << layer->name << " Incorrect input data tensor precision. Only FP32 is supported!";
37 if (layer->insData[REDUCE_INDEXES].lock()->getTensorDesc().getPrecision() != Precision::I32)
38 THROW_IE_EXCEPTION << layer->name << " Incorrect 'axes_to_reduction' input precision. Only I32 is supported!";
40 data_dims = layer->insData[REDUCE_DATA].lock()->getTensorDesc().getDims();
41 SizeVector dst_dims = layer->outData[0]->getTensorDesc().getDims();
// keep_dims == true: reduced axes are retained with size 1 in the output.
43 keep_dims = layer->GetParamAsBool("keep_dims", true);
// NOTE(review): as shown, the next two checks are contradictory — together
// they would reject every layer (`!=` throws, and when sizes are equal `<=`
// throws). Presumably they belong to the keep_dims true/false branches of an
// elided if/else: with keep_dims the ranks must match, without it the output
// rank must be strictly smaller. Confirm against the full file.
45 if (data_dims.size() != dst_dims.size())
46 THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output dimensions!";
48 if (data_dims.size() <= dst_dims.size())
49 THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output dimensions!";
// The concrete reduction is selected by the layer type string; the trailing
// THROW (original line 66) is presumably the final `else` of this chain
// (the `else` keyword falls on an elided line).
52 std::string reduce_mode = layer->type;
53 if (reduce_mode == "ReduceAnd") reduceMode = Reduce::And;
54 else if (reduce_mode == "ReduceL1") reduceMode = Reduce::L1;
55 else if (reduce_mode == "ReduceL2") reduceMode = Reduce::L2;
56 else if (reduce_mode == "ReduceLogSum") reduceMode = Reduce::LogSum;
57 else if (reduce_mode == "ReduceLogSumExp") reduceMode = Reduce::LogSumExp;
58 else if (reduce_mode == "ReduceMax") reduceMode = Reduce::Max;
59 else if (reduce_mode == "ReduceMean") reduceMode = Reduce::Mean;
60 else if (reduce_mode == "ReduceMin") reduceMode = Reduce::Min;
61 else if (reduce_mode == "ReduceOr") reduceMode = Reduce::Or;
62 else if (reduce_mode == "ReduceProd") reduceMode = Reduce::Prod;
63 else if (reduce_mode == "ReduceSum") reduceMode = Reduce::Sum;
64 else if (reduce_mode == "ReduceSumSquare") reduceMode = Reduce::SumSquare;
66 THROW_IE_EXCEPTION << layer->name << " Incorrect Reduce layer type!";
// Cache the source geometry used by reduce() at execute time.
68 src_dims = layer->insData[REDUCE_DATA].lock()->getTensorDesc().getDims();
69 srcStrides = layer->insData[REDUCE_DATA].lock()->getTensorDesc().getBlockingDesc().getStrides();
// Two planar inputs, one planar output; `false` = non-const blobs.
71 addConfig(layer, { { ConfLayout::PLN, false }, { ConfLayout::PLN, false } }, { { ConfLayout::PLN, false } });
// The `catch` implies the constructor body is a try block whose `try {` line
// is elided from this listing.
72 } catch (InferenceEngine::details::InferenceEngineException &ex) {
// Runs the reduction: normalizes the requested axes, derives the reduced
// output geometry, validates it against the actual output blob, then
// dispatches on reduceMode to reduce() with mode-specific accumulate (func1)
// and cross-partition combine (func2) lambdas. L2/LogSum/LogSumExp/Mean apply
// an elementwise post-pass (sqrt/log/divide) over the destination.
// Returns OK on success (elided), PARAMETER_MISMATCH on bad axes or output
// dims, GENERAL_ERROR for an unknown mode.
// NOTE(review): several `case Reduce::…:` labels, `break;` statements and
// braces fall on elided lines; comments describe only what is visible.
77 StatusCode execute(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs, ResponseDesc *resp) noexcept override {
78 int32_t *idx_data = inputs[REDUCE_INDEXES]->cbuffer().as<int32_t *>() +
79 inputs[REDUCE_INDEXES]->getTensorDesc().getBlockingDesc().getOffsetPadding();
// Collect the axes to reduce; negative axes are wrapped by adding the rank
// (the guarding `if (axis < 0)` is presumably on an elided line).
81 for (size_t i = 0; i < idx_dims[0]; i++) {
82 int32_t axis = idx_data[i];
84 axis += data_dims.size();
// NOTE(review): `>` lets axis == data_dims.size() (one past the last valid
// axis) slip through; `>=` looks intended. Out-of-range axis would then index
// past src_dims/srcStrides below. Confirm and fix in the full file.
86 if (static_cast<size_t>(axis) > data_dims.size()) {
88 std::string errorMsg = "Index to reduce exceeds data tensor dimension";
// NOTE(review): std::string::copy does not null-terminate resp->msg; if the
// buffer was not pre-zeroed the message may run past the copied bytes.
// (A null check on resp is presumably on an elided line.)
89 errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
91 return PARAMETER_MISMATCH;
93 axes.push_back(static_cast<size_t>(axis));
// Build: out_dims (expected output shape, honoring keep_dims), our_dims
// (working shape with reduced axes forced to 1), axes_for_reduction (sorted
// reduced axis list), and the total element count being folded per output.
96 size_t reduced_dims_work_amount = 1;
97 InferenceEngine::SizeVector our_dims, out_dims, axes_for_reduction;
98 for (size_t i = 0; i < src_dims.size(); i++) {
100 for (size_t axis : axes)
101 if (i == axis) found = true;
104 axes_for_reduction.push_back(i);
105 reduced_dims_work_amount *= src_dims[i];
106 if (keep_dims) out_dims.push_back(1);
107 our_dims.push_back(1);
// Non-reduced axis: carried through unchanged (the `else` is elided).
109 out_dims.push_back(src_dims[i]);
110 our_dims.push_back(src_dims[i]);
// Full reduction collapses to a single scalar "shape".
114 if (!our_dims.size())
115 our_dims = InferenceEngine::SizeVector(1, 1);
// Cross-check the computed shape against the real output blob.
117 InferenceEngine::SizeVector dst_dims = outputs[0]->getTensorDesc().getDims();
118 for (size_t i = 0; i < (std::min)(out_dims.size(), dst_dims.size()); i++) {
119 if (out_dims[i] != dst_dims[i]) {
121 std::string errorMsg = "Incorrect number of output dimensions!";
122 errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
124 return PARAMETER_MISMATCH;
128 const float *src_data = inputs[REDUCE_DATA]->cbuffer().as<float *>() +
129 inputs[REDUCE_DATA]->getTensorDesc().getBlockingDesc().getOffsetPadding();
// NOTE(review): writing through a pointer obtained from cbuffer() casts away
// const-ness; buffer() would express the intent better.
130 float* dst_data = outputs[0]->cbuffer().as<float *>() +
131 outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();
// Total destination elements; the scalar-output branch (dst_dims empty,
// presumably work_amount_dst = 1) falls on elided lines.
133 size_t work_amount_dst;
134 if (!dst_dims.size())
137 work_amount_dst = outputs[0]->getTensorDesc().getBlockingDesc().getStrides()[0] * dst_dims[0];
139 switch (reduceMode) {
// And: logical conjunction, neutral element 1.0f.
141 reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, 1.0f,
142 [](float x, float y)->float { return x && y; },
143 [](float x, float y)->float { return x && y; });
// L1: sum of absolute values.
146 reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, 0.0f,
147 [](float old, float y)->float { return old + (std::abs)(y); },
148 [](float x, float y)->float { return x + y; });
// L2: sum of squares, then sqrt post-pass over the destination.
151 reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, 0.0f,
152 [](float old, float y)->float { return old + y * y;},
153 [](float x, float y)->float { return x + y; });
155 parallel_for(work_amount_dst, [&](size_t i) {
156 dst_data[i] = sqrt(dst_data[i]);
// LogSum: plain sum, then log post-pass.
160 reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, 0.0f,
161 [](float x, float y)->float { return x + y; },
162 [](float x, float y)->float { return x + y; });
164 parallel_for(work_amount_dst, [&](size_t i) {
165 dst_data[i] = logf(dst_data[i]);
// LogSumExp: sum of exp, then log post-pass.
168 case Reduce::LogSumExp:
169 reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, 0.0f,
170 [](float old, float y)->float { return old + expf(y); },
171 [](float x, float y)->float { return x + y; });
173 parallel_for(work_amount_dst, [&](size_t i) {
174 dst_data[i] = logf(dst_data[i]);
// NOTE(review): FLT_MIN is the smallest POSITIVE normalized float, not the
// most negative value — an all-negative input reduces to FLT_MIN instead of
// its true maximum. The neutral element should be -FLT_MAX (or
// std::numeric_limits<float>::lowest()).
178 reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, FLT_MIN,
179 [](float x, float y)->float { return x > y ? x : y; },
180 [](float x, float y)->float { return x > y ? x : y; });
// Mean: sum, then divide by the reduced element count.
183 reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, 0.0f,
184 [](float x, float y)->float { return x + y; },
185 [](float x, float y)->float { return x + y; });
187 parallel_for(work_amount_dst, [&](size_t i) {
188 dst_data[i] /= static_cast<float>(reduced_dims_work_amount);
// Min: FLT_MAX is a correct neutral element here.
192 reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, FLT_MAX,
193 [](float x, float y)->float { return x < y ? x : y; },
194 [](float x, float y)->float { return x < y ? x : y; });
// Or: logical disjunction, neutral element 0.0f.
197 reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, 0.0f,
198 [](float x, float y)->float { return x || y; },
199 [](float x, float y)->float { return x || y; });
// Prod: product, neutral element 1.0f.
202 reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, 1.0f,
203 [](float x, float y)->float { return x * y; },
204 [](float x, float y)->float { return x * y; });
// Sum.
207 reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, 0.0f,
208 [](float x, float y)->float { return x + y; },
209 [](float x, float y)->float { return x + y; });
// SumSquare: sum of squares without the sqrt that L2 applies.
211 case Reduce::SumSquare:
212 reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, 0.0f,
213 [](float old, float y)->float { return old + y * y; },
214 [](float x, float y)->float { return x + y; });
// default: unknown enum value (same truncating, non-terminating copy as above).
218 std::string errorMsg = "Incorrect Reduce layer type";
219 errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
221 return GENERAL_ERROR;
// Generic reduction kernel: folds src_data into dst_data over
// axes_for_reduction. func1 accumulates one source element into a partial
// result; func2 merges partial results produced by different threads
// (they differ only for non-commutative-looking accumulators such as
// "old + abs(y)"). init_value is the mode's neutral element.
// Defined out of line below.
227 template <typename F1, typename F2>
228 void reduce(const float *src_data, float* dst_data, size_t work_amount_dst, size_t reduced_dims_work_amount,
229 SizeVector axes_for_reduction, SizeVector dst_dims, float init_value, F1 func1, F2 func2);
// Supported reduction modes, parsed from the layer type string.
231 enum class Reduce { And, L1, L2, LogSum, LogSumExp, Max, Mean, Min, Or, Prod, Sum, SumSquare };
// Input port indices: 0 = data tensor, 1 = axes vector.
233 const size_t REDUCE_DATA = 0;
234 const size_t REDUCE_INDEXES = 1;
// "keep_dims" layer parameter (default true): keep reduced axes as size-1 dims.
235 bool keep_dims = true;
236 Reduce reduceMode = Reduce::Sum;
// Cached geometry of the REDUCE_DATA input (set in the constructor).
// NOTE(review): idx_dims, src_dims and the `axes` vector used above are
// declared on lines elided from this listing.
237 SizeVector data_dims;
240 SizeVector srcStrides;
// Out-of-line definition of the reduction kernel. Two parallelization
// strategies, chosen by how much destination-side parallelism exists:
//  1) work_amount_dst + 1 >= nthr: split the DESTINATION elements across
//     threads; each thread walks its slice's reduced source elements with
//     multi-digit counters and writes dst directly (no combine needed).
//  2) otherwise: split the SOURCE elements across threads into per-thread
//     partial buffers of size work_amount_dst (initialized to init_value),
//     then merge the partials with func2 and store into dst.
// NOTE(review): the loop variable `j` and parts of the parameter list / body
// (e.g. the `if (update_idx) { src_idx = 0; ... }` reset) fall on elided
// lines; the `j >= 0` countdown loops are only correct if `j` is signed —
// confirm its elided declaration.
243 template <typename F1, typename F2>
244 void ReduceImpl::reduce(
245 const float *src_data,
247 size_t work_amount_dst,
248 size_t reduced_dims_work_amount,
249 SizeVector axes_for_reduction,
255 unsigned int nthr = parallel_get_max_threads();
// Strategy 1: enough destination elements to keep all threads busy.
256 if ((work_amount_dst + 1) >= nthr) {
257 parallel_nt(0, [&](const int ithr, const int nthr) {
259 size_t i, start = 0, end = 0;
260 SizeVector dst_counters(dst_dims.size(), 0);
// splitter assigns this thread the half-open dst range [start, end).
261 splitter(work_amount_dst, nthr, ithr, start, end);
// Decompose the flat start index into per-axis destination counters
// (least-significant axis last); the `i /= dst_dims[j]` step is elided.
262 for (j = dst_dims.size() - 1, i = start; j >= 0; j--) {
263 dst_counters[j] = i % dst_dims[j];
266 for (size_t src_idx, dst_idx = start; dst_idx < end; ++dst_idx) {
267 float reduce_prod = init_value;
268 bool update_idx = true;
// Source counters start at this destination coordinate; reduced axes
// (forced to 1 in dst_dims) iterate over the full source extent.
269 SizeVector src_counters = dst_counters;
270 for (i = 0; i < reduced_dims_work_amount; ++i) {
// Recompute the flat source offset from scratch only when the cheap
// incremental stride update below wasn't applicable (update_idx path;
// the reset of src_idx is on an elided line).
273 for (j = 0; j < static_cast<int>(src_dims.size()); ++j)
274 src_idx += (src_counters[j] % src_dims[j]) * srcStrides[j];
277 reduce_prod = func1(reduce_prod, src_data[src_idx]);
// Odometer-style increment over the reduced axes, least significant
// first; on non-carry, bump src_idx by the axis stride instead of
// recomputing the full offset.
278 for (j = axes_for_reduction.size() - 1; j >= 0; j--) {
279 src_counters[axes_for_reduction[j]]++;
280 if (src_counters[axes_for_reduction[j]] < src_dims[axes_for_reduction[j]]) {
281 src_idx += srcStrides[axes_for_reduction[j]];
// Carry: wrap this axis and continue to the next; forces a full
// offset recompute (flag set on an elided line).
284 src_counters[axes_for_reduction[j]] = 0;
289 dst_data[dst_idx] = reduce_prod;
// Advance the destination counters to the next output element.
290 for (j = dst_dims.size() - 1; j >= 0; j--) {
292 if (dst_counters[j] < dst_dims[j])
// Strategy 2: few destination elements — partition the source instead.
// One partial accumulator per (thread, dst element).
300 std::vector<float> reduce_prod((nthr * work_amount_dst), init_value);
// Scalar output: a straight strided scan, no coordinate bookkeeping.
301 if (work_amount_dst == 1) {
302 parallel_nt(nthr, [&](const int ithr, const int nthr) {
303 size_t i, start = 0, end = 0;
// srcStrides[0] * src_dims[0] == total source element count (planar).
304 splitter((srcStrides[0] * src_dims[0]), nthr, ithr, start, end);
305 for (i = start; i < end; ++i)
306 reduce_prod[ithr] = func1(reduce_prod[ithr], src_data[i]);
// General case: dense row-major destination strides for mapping each
// source coordinate onto its output slot.
309 SizeVector dstStrides(dst_dims.size(), 1);
310 for (int j = dst_dims.size() - 1; j >= 1; --j)
311 dstStrides[j - 1] = dstStrides[j] * dst_dims[j];
312 parallel_nt(nthr, [&](const int ithr, const int nthr) {
314 bool update_idx = true;
315 size_t i, src_idx, dst_idx = 0, start = 0, end = 0;
316 splitter((srcStrides[0] * src_dims[0]), nthr, ithr, start, end);
// Decompose this thread's starting flat source index into coordinates.
317 SizeVector src_counters(src_dims.size(), 0);
318 for (j = src_dims.size() - 1, src_idx = start; j >= 0; j--) {
319 src_counters[j] = src_idx % src_dims[j];
320 src_idx /= src_dims[j];
322 for (src_idx = start; src_idx < end; ++src_idx) {
// Full dst offset recompute (update_idx path); reduced axes contribute
// nothing because dst_dims[axis] == 1 makes the modulo vanish.
324 for (i = 0, dst_idx = 0; i < dst_dims.size(); ++i)
325 dst_idx += (src_counters[i] % dst_dims[i]) * dstStrides[i];
// Fold this source element into this thread's partial for its dst slot.
328 reduce_prod[ithr * work_amount_dst + dst_idx] = func1(reduce_prod[ithr * work_amount_dst + dst_idx], src_data[src_idx]);
// Odometer increment over the full source shape; bump dst_idx only for
// non-reduced axes (dst_dims[j] > 1).
329 for (j = src_dims.size() - 1; j >= 0; j--) {
331 if (src_counters[j] < src_dims[j]) {
332 if (dst_dims[j] > 1) dst_idx += dstStrides[j];
// Sequential merge of all threads' partials via func2, then store.
// (Runs after the parallel region — the closing of parallel_nt is elided.)
342 for (size_t dst_idx = 0; dst_idx < work_amount_dst; dst_idx++) {
343 for (size_t ithr = work_amount_dst; ithr < (nthr * work_amount_dst); ithr += work_amount_dst)
344 reduce_prod[dst_idx] = func2(reduce_prod[dst_idx], reduce_prod[dst_idx + ithr]);
345 dst_data[dst_idx] = reduce_prod[dst_idx];
// Register one ImplFactory<ReduceImpl> per supported layer type string, so the
// extension dispatcher can instantiate this implementation for each Reduce*
// layer; the constructor re-parses layer->type to pick the concrete mode.
350 REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceAnd);
351 REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceL1);
352 REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceL2);
353 REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceLogSum);
354 REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceLogSumExp);
355 REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceMax);
356 REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceMean);
357 REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceMin);
358 REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceOr);
359 REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceProd);
360 REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceSum);
361 REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceSumSquare);
364 } // namespace Extensions
365 } // namespace InferenceEngine