// inference-engine/src/extension/ext_reduce.cpp
// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "ext_list.hpp"
#include "ext_base.hpp"

#include <cmath>
#include <limits>
#include <cfloat>
#include <string>
#include <vector>
#include <cassert>
#include "ie_parallel.hpp"

namespace InferenceEngine {
namespace Extensions {
namespace Cpu {

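// Reference CPU implementation of the Reduce* layer family (ReduceSum,
// ReduceMax, ReduceMean, ...). The layer takes two inputs: the FP32 data
// tensor (port REDUCE_DATA) and an I32 vector of axes to reduce over
// (port REDUCE_INDEXES), and produces a single FP32 output. The optional
// "keep_dims" attribute (default: true) controls whether reduced axes are
// kept as size-1 dimensions or squeezed out of the output shape.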
class ReduceImpl: public ExtLayerBase {
public:
    explicit ReduceImpl(const CNNLayer* layer) {
        try {
            if (layer->insData.empty() || layer->outData.empty())
                THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output edges!";

            if (layer->insData.size() != 2)
                THROW_IE_EXCEPTION << layer->name << " Incorrect number of input edges!";

            idx_dims = layer->insData[REDUCE_INDEXES].lock()->getTensorDesc().getDims();
            if (idx_dims.size() > 1)
                THROW_IE_EXCEPTION << layer->name << " Index vector must be 1-dimensional!";

            if (layer->insData[REDUCE_DATA].lock()->getTensorDesc().getPrecision() != Precision::FP32)
                THROW_IE_EXCEPTION << layer->name << " Incorrect input data tensor precision. Only FP32 is supported!";

            if (layer->insData[REDUCE_INDEXES].lock()->getTensorDesc().getPrecision() != Precision::I32)
                THROW_IE_EXCEPTION << layer->name << " Incorrect 'axes_to_reduction' input precision. Only I32 is supported!";
            data_dims = layer->insData[REDUCE_DATA].lock()->getTensorDesc().getDims();
            SizeVector dst_dims = layer->outData[0]->getTensorDesc().getDims();

            keep_dims = layer->GetParamAsBool("keep_dims", true);
            if (keep_dims) {
                if (data_dims.size() != dst_dims.size())
                    THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output dimensions!";
            } else {
                if (data_dims.size() <= dst_dims.size())
                    THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output dimensions!";
            }

            std::string reduce_mode = layer->type;
            if (reduce_mode == "ReduceAnd") reduceMode = Reduce::And;
            else if (reduce_mode == "ReduceL1") reduceMode = Reduce::L1;
            else if (reduce_mode == "ReduceL2") reduceMode = Reduce::L2;
            else if (reduce_mode == "ReduceLogSum") reduceMode = Reduce::LogSum;
            else if (reduce_mode == "ReduceLogSumExp") reduceMode = Reduce::LogSumExp;
            else if (reduce_mode == "ReduceMax") reduceMode = Reduce::Max;
            else if (reduce_mode == "ReduceMean") reduceMode = Reduce::Mean;
            else if (reduce_mode == "ReduceMin") reduceMode = Reduce::Min;
            else if (reduce_mode == "ReduceOr") reduceMode = Reduce::Or;
            else if (reduce_mode == "ReduceProd") reduceMode = Reduce::Prod;
            else if (reduce_mode == "ReduceSum") reduceMode = Reduce::Sum;
            else if (reduce_mode == "ReduceSumSquare") reduceMode = Reduce::SumSquare;
            else
                THROW_IE_EXCEPTION << layer->name << " Incorrect Reduce layer type!";
            src_dims = layer->insData[REDUCE_DATA].lock()->getTensorDesc().getDims();
            srcStrides = layer->insData[REDUCE_DATA].lock()->getTensorDesc().getBlockingDesc().getStrides();

            addConfig(layer, { { ConfLayout::PLN, false }, { ConfLayout::PLN, false } }, { { ConfLayout::PLN, false } });
        } catch (InferenceEngine::details::InferenceEngineException &ex) {
            errorMsg = ex.what();
        }
    }

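    // Runs the reduction: normalizes the axes read from the REDUCE_INDEXES
    // input, validates the output shape against the shape implied by those
    // axes, then dispatches to reduce() with the accumulation functors for
    // the configured mode.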
    StatusCode execute(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs, ResponseDesc *resp) noexcept override {
        int32_t *idx_data = inputs[REDUCE_INDEXES]->cbuffer().as<int32_t *>() +
                            inputs[REDUCE_INDEXES]->getTensorDesc().getBlockingDesc().getOffsetPadding();
        SizeVector axes;
        for (size_t i = 0; i < idx_dims[0]; i++) {
            int32_t axis = idx_data[i];
            if (axis < 0)
                axis += data_dims.size();

            // Valid axes lie in [0, rank); an axis equal to the rank is out of range.
            if (static_cast<size_t>(axis) >= data_dims.size()) {
                if (resp) {
                    std::string errorMsg = "Index to reduce exceeds data tensor dimension";
                    const size_t len = errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
                    resp->msg[len] = '\0';  // std::string::copy does not null-terminate
                }
                return PARAMETER_MISMATCH;
            }
            axes.push_back(static_cast<size_t>(axis));
        }

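        // Partition the input dimensions into reduced and kept ones:
        // axes_for_reduction collects the reduced axes, out_dims is the
        // expected output shape, and our_dims is the input shape with every
        // reduced dimension collapsed to 1.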
        size_t reduced_dims_work_amount = 1;
        InferenceEngine::SizeVector our_dims, out_dims, axes_for_reduction;
        for (size_t i = 0; i < src_dims.size(); i++) {
            bool found = false;
            for (size_t axis : axes)
                if (i == axis) found = true;

            if (found) {
                axes_for_reduction.push_back(i);
                reduced_dims_work_amount *= src_dims[i];
                if (keep_dims) out_dims.push_back(1);
                our_dims.push_back(1);
            } else {
                out_dims.push_back(src_dims[i]);
                our_dims.push_back(src_dims[i]);
            }
        }

        if (!our_dims.size())
            our_dims = InferenceEngine::SizeVector(1, 1);

        InferenceEngine::SizeVector dst_dims = outputs[0]->getTensorDesc().getDims();
        for (size_t i = 0; i < (std::min)(out_dims.size(), dst_dims.size()); i++) {
            if (out_dims[i] != dst_dims[i]) {
                if (resp) {
                    std::string errorMsg = "Incorrect number of output dimensions!";
                    const size_t len = errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
                    resp->msg[len] = '\0';
                }
                return PARAMETER_MISMATCH;
            }
        }

        const float *src_data = inputs[REDUCE_DATA]->cbuffer().as<float *>() +
            inputs[REDUCE_DATA]->getTensorDesc().getBlockingDesc().getOffsetPadding();
        float* dst_data = outputs[0]->buffer().as<float *>() +
            outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();

        // Total number of output elements to produce.
        size_t work_amount_dst;
        if (!dst_dims.size())
            work_amount_dst = 1;
        else
            work_amount_dst = outputs[0]->getTensorDesc().getBlockingDesc().getStrides()[0] * dst_dims[0];

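        // Each mode supplies two functors to reduce(): the first folds a
        // source element into a partial result, the second merges two partial
        // results (used when several threads reduce into the same output).
        // Modes such as L2, LogSum, LogSumExp and Mean finish with an extra
        // element-wise pass over the output.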
        switch (reduceMode) {
        case Reduce::And:
            reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, 1.0f,
                   [](float x, float y)->float { return x && y; },
                   [](float x, float y)->float { return x && y; });
            break;
        case Reduce::L1:
            reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, 0.0f,
                [](float old, float y)->float { return old + (std::abs)(y); },
                [](float x, float y)->float { return x + y; });
            break;
        case Reduce::L2:
            reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, 0.0f,
                [](float old, float y)->float { return old + y * y; },
                [](float x, float y)->float { return x + y; });

            parallel_for(work_amount_dst, [&](size_t i) {
                dst_data[i] = sqrtf(dst_data[i]);
            });
            break;
        case Reduce::LogSum:
            reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, 0.0f,
                [](float x, float y)->float { return x + y; },
                [](float x, float y)->float { return x + y; });

            parallel_for(work_amount_dst, [&](size_t i) {
                dst_data[i] = logf(dst_data[i]);
            });
            break;
        case Reduce::LogSumExp:
            reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, 0.0f,
                [](float old, float y)->float { return old + expf(y); },
                [](float x, float y)->float { return x + y; });

            parallel_for(work_amount_dst, [&](size_t i) {
                dst_data[i] = logf(dst_data[i]);
            });
            break;
        case Reduce::Max:
            // Start from the lowest finite float: FLT_MIN is the smallest
            // positive normalized value and would give wrong results for
            // all-negative inputs.
            reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, -FLT_MAX,
                [](float x, float y)->float { return x > y ? x : y; },
                [](float x, float y)->float { return x > y ? x : y; });
            break;
        case Reduce::Mean:
            reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, 0.0f,
                [](float x, float y)->float { return x + y; },
                [](float x, float y)->float { return x + y; });

            parallel_for(work_amount_dst, [&](size_t i) {
                dst_data[i] /= static_cast<float>(reduced_dims_work_amount);
            });
            break;
        case Reduce::Min:
            reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, FLT_MAX,
                [](float x, float y)->float { return x < y ? x : y; },
                [](float x, float y)->float { return x < y ? x : y; });
            break;
        case Reduce::Or:
            reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, 0.0f,
                   [](float x, float y)->float { return x || y; },
                   [](float x, float y)->float { return x || y; });
            break;
        case Reduce::Prod:
            reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, 1.0f,
                [](float x, float y)->float { return x * y; },
                [](float x, float y)->float { return x * y; });
            break;
        case Reduce::Sum:
            reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, 0.0f,
                [](float x, float y)->float { return x + y; },
                [](float x, float y)->float { return x + y; });
            break;
        case Reduce::SumSquare:
            reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, 0.0f,
                [](float old, float y)->float { return old + y * y; },
                [](float x, float y)->float { return x + y; });
            break;
        default:
            if (resp) {
                std::string errorMsg = "Incorrect Reduce layer type";
                const size_t len = errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
                resp->msg[len] = '\0';
            }
            return GENERAL_ERROR;
        }
        return OK;
    }

private:
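    // Core reduction kernel. func1 folds one source element into a partial
    // accumulator; func2 merges two partial accumulators and must be
    // associative for the per-thread merge to be valid. init_value is the
    // identity element of func1 (e.g. 0 for sums, 1 for products).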
    template <typename F1, typename F2>
    void reduce(const float *src_data, float* dst_data, size_t work_amount_dst, size_t reduced_dims_work_amount,
        SizeVector axes_for_reduction, SizeVector dst_dims, float init_value, F1 func1, F2 func2);

    enum class Reduce { And, L1, L2, LogSum, LogSumExp, Max, Mean, Min, Or, Prod, Sum, SumSquare };

    const size_t REDUCE_DATA = 0;
    const size_t REDUCE_INDEXES = 1;
    bool keep_dims = true;
    Reduce reduceMode = Reduce::Sum;
    SizeVector data_dims;
    SizeVector idx_dims;
    SizeVector src_dims;
    SizeVector srcStrides;
};
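// Two parallelization strategies are used below. When there are at least as
// many output elements as threads, the output space is split across threads
// and each thread walks its own slice of source elements. Otherwise the
// source is split across threads, each thread accumulates into a private
// copy of the output, and the per-thread partials are merged with func2.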
template <typename F1, typename F2>
void ReduceImpl::reduce(
    const float *src_data,
    float       *dst_data,
    size_t       work_amount_dst,
    size_t       reduced_dims_work_amount,
    SizeVector   axes_for_reduction,
    SizeVector   dst_dims,
    float        init_value,
    F1           func1,
    F2           func2
) {
    unsigned int nthr = parallel_get_max_threads();
    if ((work_amount_dst + 1) >= nthr) {
        parallel_nt(0, [&](const int ithr, const int nthr) {
            int j;
            size_t i, start = 0, end = 0;
            SizeVector dst_counters(dst_dims.size(), 0);
            splitter(work_amount_dst, nthr, ithr, start, end);
            for (j = dst_dims.size() - 1, i = start; j >= 0; j--) {
                dst_counters[j] = i % dst_dims[j];
                i /= dst_dims[j];
            }
            for (size_t src_idx = 0, dst_idx = start; dst_idx < end; ++dst_idx) {
                float reduce_prod = init_value;
                bool update_idx = true;
                SizeVector src_counters = dst_counters;
                for (i = 0; i < reduced_dims_work_amount; ++i) {
                    if (update_idx) {
                        src_idx = 0;
                        for (j = 0; j < static_cast<int>(src_dims.size()); ++j)
                            src_idx += (src_counters[j] % src_dims[j]) * srcStrides[j];
                        update_idx = false;
                    }
                    reduce_prod = func1(reduce_prod, src_data[src_idx]);
                    for (j = axes_for_reduction.size() - 1; j >= 0; j--) {
                        src_counters[axes_for_reduction[j]]++;
                        if (src_counters[axes_for_reduction[j]] < src_dims[axes_for_reduction[j]]) {
                            src_idx += srcStrides[axes_for_reduction[j]];
                            break;
                        } else {
                            src_counters[axes_for_reduction[j]] = 0;
                            update_idx = true;
                        }
                    }
                }
                dst_data[dst_idx] = reduce_prod;
                for (j = dst_dims.size() - 1; j >= 0; j--) {
                    dst_counters[j]++;
                    if (dst_counters[j] < dst_dims[j])
                        break;
                    else
                        dst_counters[j] = 0;
                }
            }
        });
    } else {
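        // Fewer outputs than threads: each thread accumulates into its own
        // slot(s) of reduce_prod. For a single output element the source is
        // reduced as a flat array; otherwise destination strides are built to
        // map each source element to its per-thread partial slot.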
        std::vector<float> reduce_prod((nthr * work_amount_dst), init_value);
        if (work_amount_dst == 1) {
            parallel_nt(nthr, [&](const int ithr, const int nthr) {
                size_t i, start = 0, end = 0;
                splitter((srcStrides[0] * src_dims[0]), nthr, ithr, start, end);
                for (i = start; i < end; ++i)
                    reduce_prod[ithr] = func1(reduce_prod[ithr], src_data[i]);
            });
        } else {
            SizeVector dstStrides(dst_dims.size(), 1);
            for (int j = dst_dims.size() - 1; j >= 1; --j)
                dstStrides[j - 1] = dstStrides[j] * dst_dims[j];
            parallel_nt(nthr, [&](const int ithr, const int nthr) {
                int j;
                bool update_idx = true;
                size_t i, src_idx, dst_idx = 0, start = 0, end = 0;
                splitter((srcStrides[0] * src_dims[0]), nthr, ithr, start, end);
                SizeVector src_counters(src_dims.size(), 0);
                for (j = src_dims.size() - 1, src_idx = start; j >= 0; j--) {
                    src_counters[j] = src_idx % src_dims[j];
                    src_idx /= src_dims[j];
                }
                for (src_idx = start; src_idx < end; ++src_idx) {
                    if (update_idx) {
                        for (i = 0, dst_idx = 0; i < dst_dims.size(); ++i)
                            dst_idx += (src_counters[i] % dst_dims[i]) * dstStrides[i];
                        update_idx = false;
                    }
                    reduce_prod[ithr * work_amount_dst + dst_idx] = func1(reduce_prod[ithr * work_amount_dst + dst_idx], src_data[src_idx]);
                    for (j = src_dims.size() - 1; j >= 0; j--) {
                        src_counters[j]++;
                        if (src_counters[j] < src_dims[j]) {
                            if (dst_dims[j] > 1) dst_idx += dstStrides[j];
                            break;
                        } else {
                            src_counters[j] = 0;
                            update_idx = true;
                        }
                    }
                }
            });
        }
        for (size_t dst_idx = 0; dst_idx < work_amount_dst; dst_idx++) {
            for (size_t ithr = work_amount_dst; ithr < (nthr * work_amount_dst); ithr += work_amount_dst)
                reduce_prod[dst_idx] = func2(reduce_prod[dst_idx], reduce_prod[dst_idx + ithr]);
            dst_data[dst_idx] = reduce_prod[dst_idx];
        }
    }
}

REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceAnd);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceL1);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceL2);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceLogSum);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceLogSumExp);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceMax);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceMean);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceMin);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceOr);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceProd);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceSum);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceSumSquare);
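
// Illustrative example (comment only): with keep_dims = 1, ReduceSum over
// axes = {1} of the 2x3 FP32 input
//     [[1, 2, 3],
//      [4, 5, 6]]
// yields the 2x1 output [[6], [15]]; with keep_dims = 0 the output shape is
// [2]. Negative axes are normalized in execute(), so axes = {-1} is
// equivalent here.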

}  // namespace Cpu
}  // namespace Extensions
}  // namespace InferenceEngine