inference-engine/thirdparty/mkl-dnn/src/cpu/simple_sum.cpp

   1 /*******************************************************************************
   2 * Copyright 2017-2018 Intel Corporation
   3 *
   4 * Licensed under the Apache License, Version 2.0 (the "License");
   5 * you may not use this file except in compliance with the License.
   6 * You may obtain a copy of the License at
   7 *
   8 *     http://www.apache.org/licenses/LICENSE-2.0
   9 *
  10 * Unless required by applicable law or agreed to in writing, software
  11 * distributed under the License is distributed on an "AS IS" BASIS,
  12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 * See the License for the specific language governing permissions and
  14 * limitations under the License.
  15 *******************************************************************************/
  16
  17 #include "mkldnn_thread.hpp"
  18 #include "simple_sum.hpp"
  19
  20 namespace mkldnn {
  21 namespace impl {
  22 namespace cpu {
  23
  24 template <data_type_t data_type>
  25 void simple_sum_t<data_type>::execute() const {
  26     auto output = reinterpret_cast<data_t *>(this->memory());
  27     const int num_arrs = pd()->n_inputs();
  28     const memory_desc_wrapper o_d(pd()->dst_pd());
  29     output += o_d.blk_off(0);
  30     const size_t nelems = o_d.nelems();
  31     const data_t *input_ptrs[max_num_arrs];
  32
  33     for (int a = 0; a < num_arrs; ++a) {
  34         const memory_desc_wrapper i_d(pd()->src_pd(a));
  35
  36         input_ptrs[a] = reinterpret_cast<const data_t *>(
  37                 this->input_memory(a)) + i_d.blk_off(0);
  38     }
  39
  40     const size_t block_size = 16 * 1024 / sizeof(data_type);
  41     const size_t blocks_number = nelems / block_size;
  42     const size_t tail = nelems % block_size;
  43
  44     const auto &scales = pd()->scales_;
  45     parallel(0, [&](const int ithr, const int nthr) {
  46         size_t start{0}, end{0};
  47         balance211(blocks_number, nthr, ithr, start, end);
  48
  49         for (size_t nb = start; nb < end; ++nb) {
  50             size_t start_e = nb * block_size;
  51             size_t end_e = start_e + block_size;
  52
  53             PRAGMA_OMP_SIMD()
  54             for (size_t e = start_e; e < end_e; e++) {
  55                 output[e] = data_t(scales[0] * input_ptrs[0][e]);
  56             }
  57             for (int a = 1; a < num_arrs; a++) {
  58                 PRAGMA_OMP_SIMD()
  59                 for (size_t e = start_e; e < end_e; e++) {
  60                     output[e] += data_t(scales[a] * input_ptrs[a][e]);
  61                 }
  62             }
  63         }
  64
  65         if (tail != 0 && ithr == nthr - 1) {
  66             size_t start_e = nelems - tail;
  67             size_t end_e = nelems;
  68
  69             PRAGMA_OMP_SIMD()
  70             for (size_t e = start_e; e < end_e; e++) {
  71                 output[e] = data_t(scales[0] * input_ptrs[0][e]);
  72             }
  73             for (int a = 1; a < num_arrs; a++) {
  74                 PRAGMA_OMP_SIMD()
  75                 for (size_t e = start_e; e < end_e; e++) {
  76                     output[e] += data_t(scales[a] * input_ptrs[a][e]);
  77                 }
  78             }
  79         }
  80     });
  81 }
  82
   83 template struct simple_sum_t<data_type::f32>; // explicit instantiation: f32 is the only one emitted here
  84
  85 }
  86 }
  87 }