// inference-engine/src/extension/ext_reduce.cpp
// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "ext_list.hpp"
#include "ext_base.hpp"

#include <cmath>
#include <limits>
#include <cfloat>
#include <string>
#include <vector>
#include <cassert>
#include "ie_parallel.hpp"

namespace InferenceEngine {
namespace Extensions {
namespace Cpu {

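// Reference CPU implementation of the Reduce* layer family (ReduceSum,
// ReduceMax, ReduceMean, ...). The layer takes two inputs: the FP32 data
// tensor (port REDUCE_DATA) and an I32 vector of axes to reduce over
// (port REDUCE_INDEXES), and produces a single FP32 output. The optional
// "keep_dims" attribute (default: true) controls whether reduced axes are
// kept as size-1 dimensions or squeezed out of the output shape.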
class ReduceImpl: public ExtLayerBase {
public:
    explicit ReduceImpl(const CNNLayer* layer) {
        try {
            if (layer->insData.empty() || layer->outData.empty())
                THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output edges!";

            if (layer->insData.size() != 2)
                THROW_IE_EXCEPTION << layer->name << " Incorrect number of input edges!";

            idx_dims = layer->insData[REDUCE_INDEXES].lock()->getTensorDesc().getDims();
            if (idx_dims.size() > 1)
                THROW_IE_EXCEPTION << layer->name << " Index vector must be 1-dimensional!";

            if (layer->insData[REDUCE_DATA].lock()->getTensorDesc().getPrecision() != Precision::FP32)
                THROW_IE_EXCEPTION << layer->name << " Incorrect input data tensor precision. Only FP32 is supported!";

            if (layer->insData[REDUCE_INDEXES].lock()->getTensorDesc().getPrecision() != Precision::I32)
                THROW_IE_EXCEPTION << layer->name << " Incorrect 'axes_to_reduction' input precision. Only I32 is supported!";
            data_dims = layer->insData[REDUCE_DATA].lock()->getTensorDesc().getDims();
            SizeVector dst_dims = layer->outData[0]->getTensorDesc().getDims();

            keep_dims = layer->GetParamAsBool("keep_dims", true);
            if (keep_dims) {
                if (data_dims.size() != dst_dims.size())
                    THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output dimensions!";
            } else {
                if (data_dims.size() <= dst_dims.size())
                    THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output dimensions!";
            }

            std::string reduce_mode = layer->type;
            if (reduce_mode == "ReduceAnd") reduceMode = Reduce::And;
            else if (reduce_mode == "ReduceL1") reduceMode = Reduce::L1;
            else if (reduce_mode == "ReduceL2") reduceMode = Reduce::L2;
            else if (reduce_mode == "ReduceLogSum") reduceMode = Reduce::LogSum;
            else if (reduce_mode == "ReduceLogSumExp") reduceMode = Reduce::LogSumExp;
            else if (reduce_mode == "ReduceMax") reduceMode = Reduce::Max;
            else if (reduce_mode == "ReduceMean") reduceMode = Reduce::Mean;
            else if (reduce_mode == "ReduceMin") reduceMode = Reduce::Min;
            else if (reduce_mode == "ReduceOr") reduceMode = Reduce::Or;
            else if (reduce_mode == "ReduceProd") reduceMode = Reduce::Prod;
            else if (reduce_mode == "ReduceSum") reduceMode = Reduce::Sum;
            else if (reduce_mode == "ReduceSumSquare") reduceMode = Reduce::SumSquare;
            else
                THROW_IE_EXCEPTION << layer->name << " Incorrect Reduce layer type!";
            src_dims = layer->insData[REDUCE_DATA].lock()->getTensorDesc().getDims();
            srcStrides = layer->insData[REDUCE_DATA].lock()->getTensorDesc().getBlockingDesc().getStrides();

            addConfig(layer, { { ConfLayout::PLN, false }, { ConfLayout::PLN, false } }, { { ConfLayout::PLN, false } });
        } catch (InferenceEngine::details::InferenceEngineException &ex) {
            errorMsg = ex.what();
        }
    }

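    // Runs the reduction: normalizes the axes read from the REDUCE_INDEXES
    // input, validates the output shape against the shape implied by those
    // axes, then dispatches to reduce() with the accumulation functors for
    // the configured mode.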
    StatusCode execute(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs, ResponseDesc *resp) noexcept override {
        int32_t *idx_data = inputs[REDUCE_INDEXES]->cbuffer().as<int32_t *>() +
                            inputs[REDUCE_INDEXES]->getTensorDesc().getBlockingDesc().getOffsetPadding();
        SizeVector axes;
        for (size_t i = 0; i < idx_dims[0]; i++) {
            int32_t axis = idx_data[i];
            if (axis < 0)
                axis += data_dims.size();

            // Valid axes lie in [0, rank); an axis equal to the rank is out of range.
            if (static_cast<size_t>(axis) >= data_dims.size()) {
                if (resp) {
                    std::string errorMsg = "Index to reduce exceeds data tensor dimension";
                    const size_t len = errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
                    resp->msg[len] = '\0';  // std::string::copy does not null-terminate
                }
                return PARAMETER_MISMATCH;
            }
            axes.push_back(static_cast<size_t>(axis));
        }

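        // Partition the input dimensions into reduced and kept ones:
        // axes_for_reduction collects the reduced axes, out_dims is the
        // expected output shape, and our_dims is the input shape with every
        // reduced dimension collapsed to 1.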
        size_t reduced_dims_work_amount = 1;
        InferenceEngine::SizeVector our_dims, out_dims, axes_for_reduction;
        for (size_t i = 0; i < src_dims.size(); i++) {
            bool found = false;
            for (size_t axis : axes)
                if (i == axis) found = true;

            if (found) {
                axes_for_reduction.push_back(i);
                reduced_dims_work_amount *= src_dims[i];
                if (keep_dims) out_dims.push_back(1);
                our_dims.push_back(1);
            } else {
                out_dims.push_back(src_dims[i]);
                our_dims.push_back(src_dims[i]);
            }
        }

        if (!our_dims.size())
            our_dims = InferenceEngine::SizeVector(1, 1);

        InferenceEngine::SizeVector dst_dims = outputs[0]->getTensorDesc().getDims();
        for (size_t i = 0; i < (std::min)(out_dims.size(), dst_dims.size()); i++) {
            if (out_dims[i] != dst_dims[i]) {
                if (resp) {
                    std::string errorMsg = "Incorrect number of output dimensions!";
                    const size_t len = errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
                    resp->msg[len] = '\0';
                }
                return PARAMETER_MISMATCH;
            }
        }

        const float *src_data = inputs[REDUCE_DATA]->cbuffer().as<float *>() +
            inputs[REDUCE_DATA]->getTensorDesc().getBlockingDesc().getOffsetPadding();
        float* dst_data = outputs[0]->buffer().as<float *>() +
            outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();

        // Total number of output elements to produce.
        size_t work_amount_dst;
        if (!dst_dims.size())
            work_amount_dst = 1;
        else
            work_amount_dst = outputs[0]->getTensorDesc().getBlockingDesc().getStrides()[0] * dst_dims[0];

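        // Each mode supplies two functors to reduce(): the first folds a
        // source element into a partial result, the second merges two partial
        // results (used when several threads reduce into the same output).
        // Modes such as L2, LogSum, LogSumExp and Mean finish with an extra
        // element-wise pass over the output.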
        switch (reduceMode) {
        case Reduce::And:
            reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, 1.0f,
                   [](float x, float y)->float { return x && y; },
                   [](float x, float y)->float { return x && y; });
            break;
        case Reduce::L1:
            reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, 0.0f,
                [](float old, float y)->float { return old + (std::abs)(y); },
                [](float x, float y)->float { return x + y; });
            break;
        case Reduce::L2:
            reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, 0.0f,
                [](float old, float y)->float { return old + y * y; },
                [](float x, float y)->float { return x + y; });

            parallel_for(work_amount_dst, [&](size_t i) {
                dst_data[i] = sqrtf(dst_data[i]);
            });
            break;
        case Reduce::LogSum:
            reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, 0.0f,
                [](float x, float y)->float { return x + y; },
                [](float x, float y)->float { return x + y; });

            parallel_for(work_amount_dst, [&](size_t i) {
                dst_data[i] = logf(dst_data[i]);
            });
            break;
        case Reduce::LogSumExp:
            reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, 0.0f,
                [](float old, float y)->float { return old + expf(y); },
                [](float x, float y)->float { return x + y; });

            parallel_for(work_amount_dst, [&](size_t i) {
                dst_data[i] = logf(dst_data[i]);
            });
            break;
        case Reduce::Max:
            // Start from the lowest finite float: FLT_MIN is the smallest
            // positive normalized value and would give wrong results for
            // all-negative inputs.
            reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, -FLT_MAX,
                [](float x, float y)->float { return x > y ? x : y; },
                [](float x, float y)->float { return x > y ? x : y; });
            break;
        case Reduce::Mean:
            reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, 0.0f,
                [](float x, float y)->float { return x + y; },
                [](float x, float y)->float { return x + y; });

            parallel_for(work_amount_dst, [&](size_t i) {
                dst_data[i] /= static_cast<float>(reduced_dims_work_amount);
            });
            break;
        case Reduce::Min:
            reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, FLT_MAX,
                [](float x, float y)->float { return x < y ? x : y; },
                [](float x, float y)->float { return x < y ? x : y; });
            break;
        case Reduce::Or:
            reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, 0.0f,
                   [](float x, float y)->float { return x || y; },
                   [](float x, float y)->float { return x || y; });
            break;
        case Reduce::Prod:
            reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, 1.0f,
                [](float x, float y)->float { return x * y; },
                [](float x, float y)->float { return x * y; });
            break;
        case Reduce::Sum:
            reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, 0.0f,
                [](float x, float y)->float { return x + y; },
                [](float x, float y)->float { return x + y; });
            break;
        case Reduce::SumSquare:
            reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, 0.0f,
                [](float old, float y)->float { return old + y * y; },
                [](float x, float y)->float { return x + y; });
            break;
        default:
            if (resp) {
                std::string errorMsg = "Incorrect Reduce layer type";
                const size_t len = errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
                resp->msg[len] = '\0';
            }
            return GENERAL_ERROR;
        }
        return OK;
    }

private:
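    // Core reduction kernel. func1 folds one source element into a partial
    // accumulator; func2 merges two partial accumulators and must be
    // associative for the per-thread merge to be valid. init_value is the
    // identity element of func1 (e.g. 0 for sums, 1 for products).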
    template <typename F1, typename F2>
    void reduce(const float *src_data, float* dst_data, size_t work_amount_dst, size_t reduced_dims_work_amount,
        SizeVector axes_for_reduction, SizeVector dst_dims, float init_value, F1 func1, F2 func2);

    enum class Reduce { And, L1, L2, LogSum, LogSumExp, Max, Mean, Min, Or, Prod, Sum, SumSquare };

    const size_t REDUCE_DATA = 0;
    const size_t REDUCE_INDEXES = 1;
    bool keep_dims = true;
    Reduce reduceMode = Reduce::Sum;
    SizeVector data_dims;
    SizeVector idx_dims;
    SizeVector src_dims;
    SizeVector srcStrides;
};
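// Two parallelization strategies are used below. When there are at least as
// many output elements as threads, the output space is split across threads
// and each thread walks its own slice of source elements. Otherwise the
// source is split across threads, each thread accumulates into a private
// copy of the output, and the per-thread partials are merged with func2.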
template <typename F1, typename F2>
void ReduceImpl::reduce(
    const float *src_data,
    float       *dst_data,
    size_t       work_amount_dst,
    size_t       reduced_dims_work_amount,
    SizeVector   axes_for_reduction,
    SizeVector   dst_dims,
    float        init_value,
    F1           func1,
    F2           func2
) {
    unsigned int nthr = parallel_get_max_threads();
    if ((work_amount_dst + 1) >= nthr) {
        parallel_nt(0, [&](const int ithr, const int nthr) {
            int j;
            size_t i, start = 0, end = 0;
            SizeVector dst_counters(dst_dims.size(), 0);
            splitter(work_amount_dst, nthr, ithr, start, end);
            for (j = dst_dims.size() - 1, i = start; j >= 0; j--) {
                dst_counters[j] = i % dst_dims[j];
                i /= dst_dims[j];
            }
            for (size_t src_idx = 0, dst_idx = start; dst_idx < end; ++dst_idx) {
                float reduce_prod = init_value;
                bool update_idx = true;
                SizeVector src_counters = dst_counters;
                for (i = 0; i < reduced_dims_work_amount; ++i) {
                    if (update_idx) {
                        src_idx = 0;
                        for (j = 0; j < static_cast<int>(src_dims.size()); ++j)
                            src_idx += (src_counters[j] % src_dims[j]) * srcStrides[j];
                        update_idx = false;
                    }
                    reduce_prod = func1(reduce_prod, src_data[src_idx]);
                    for (j = axes_for_reduction.size() - 1; j >= 0; j--) {
                        src_counters[axes_for_reduction[j]]++;
                        if (src_counters[axes_for_reduction[j]] < src_dims[axes_for_reduction[j]]) {
                            src_idx += srcStrides[axes_for_reduction[j]];
                            break;
                        } else {
                            src_counters[axes_for_reduction[j]] = 0;
                            update_idx = true;
                        }
                    }
                }
                dst_data[dst_idx] = reduce_prod;
                for (j = dst_dims.size() - 1; j >= 0; j--) {
                    dst_counters[j]++;
                    if (dst_counters[j] < dst_dims[j])
                        break;
                    else
                        dst_counters[j] = 0;
                }
            }
        });
    } else {
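        // Fewer outputs than threads: each thread accumulates into its own
        // slot(s) of reduce_prod. For a single output element the source is
        // reduced as a flat array; otherwise destination strides are built to
        // map each source element to its per-thread partial slot.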
        std::vector<float> reduce_prod((nthr * work_amount_dst), init_value);
        if (work_amount_dst == 1) {
            parallel_nt(nthr, [&](const int ithr, const int nthr) {
                size_t i, start = 0, end = 0;
                splitter((srcStrides[0] * src_dims[0]), nthr, ithr, start, end);
                for (i = start; i < end; ++i)
                    reduce_prod[ithr] = func1(reduce_prod[ithr], src_data[i]);
            });
        } else {
            SizeVector dstStrides(dst_dims.size(), 1);
            for (int j = dst_dims.size() - 1; j >= 1; --j)
                dstStrides[j - 1] = dstStrides[j] * dst_dims[j];
            parallel_nt(nthr, [&](const int ithr, const int nthr) {
                int j;
                bool update_idx = true;
                size_t i, src_idx, dst_idx = 0, start = 0, end = 0;
                splitter((srcStrides[0] * src_dims[0]), nthr, ithr, start, end);
                SizeVector src_counters(src_dims.size(), 0);
                for (j = src_dims.size() - 1, src_idx = start; j >= 0; j--) {
                    src_counters[j] = src_idx % src_dims[j];
                    src_idx /= src_dims[j];
                }
                for (src_idx = start; src_idx < end; ++src_idx) {
                    if (update_idx) {
                        for (i = 0, dst_idx = 0; i < dst_dims.size(); ++i)
                            dst_idx += (src_counters[i] % dst_dims[i]) * dstStrides[i];
                        update_idx = false;
                    }
                    reduce_prod[ithr * work_amount_dst + dst_idx] = func1(reduce_prod[ithr * work_amount_dst + dst_idx], src_data[src_idx]);
                    for (j = src_dims.size() - 1; j >= 0; j--) {
                        src_counters[j]++;
                        if (src_counters[j] < src_dims[j]) {
                            if (dst_dims[j] > 1) dst_idx += dstStrides[j];
                            break;
                        } else {
                            src_counters[j] = 0;
                            update_idx = true;
                        }
                    }
                }
            });
        }
        for (size_t dst_idx = 0; dst_idx < work_amount_dst; dst_idx++) {
            for (size_t ithr = work_amount_dst; ithr < (nthr * work_amount_dst); ithr += work_amount_dst)
                reduce_prod[dst_idx] = func2(reduce_prod[dst_idx], reduce_prod[dst_idx + ithr]);
            dst_data[dst_idx] = reduce_prod[dst_idx];
        }
    }
}

REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceAnd);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceL1);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceL2);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceLogSum);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceLogSumExp);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceMax);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceMean);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceMin);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceOr);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceProd);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceSum);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceSumSquare);
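
// Illustrative example (comment only): with keep_dims = 1, ReduceSum over
// axes = {1} of the 2x3 FP32 input
//     [[1, 2, 3],
//      [4, 5, 6]]
// yields the 2x1 output [[6], [15]]; with keep_dims = 0 the output shape is
// [2]. Negative axes are normalized in execute(), so axes = {-1} is
// equivalent here.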

}  // namespace Cpu
}  // namespace Extensions
}  // namespace InferenceEngine