1 /*******************************************************************************
2 * Copyright 2017-2018 Intel Corporation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *******************************************************************************/
17 #include "mkldnn_thread.hpp"
18 #include "simple_sum.hpp"
24 template <data_type_t data_type>
25 void simple_sum_t<data_type>::execute() const {
26 auto output = reinterpret_cast<data_t *>(this->memory());
27 const int num_arrs = pd()->n_inputs();
28 const memory_desc_wrapper o_d(pd()->dst_pd());
29 output += o_d.blk_off(0);
30 const size_t nelems = o_d.nelems();
31 const data_t *input_ptrs[max_num_arrs];
33 for (int a = 0; a < num_arrs; ++a) {
34 const memory_desc_wrapper i_d(pd()->src_pd(a));
36 input_ptrs[a] = reinterpret_cast<const data_t *>(
37 this->input_memory(a)) + i_d.blk_off(0);
40 const size_t block_size = 16 * 1024 / sizeof(data_type);
41 const size_t blocks_number = nelems / block_size;
42 const size_t tail = nelems % block_size;
44 const auto &scales = pd()->scales_;
45 parallel(0, [&](const int ithr, const int nthr) {
46 size_t start{0}, end{0};
47 balance211(blocks_number, nthr, ithr, start, end);
49 for (size_t nb = start; nb < end; ++nb) {
50 size_t start_e = nb * block_size;
51 size_t end_e = start_e + block_size;
54 for (size_t e = start_e; e < end_e; e++) {
55 output[e] = data_t(scales[0] * input_ptrs[0][e]);
57 for (int a = 1; a < num_arrs; a++) {
59 for (size_t e = start_e; e < end_e; e++) {
60 output[e] += data_t(scales[a] * input_ptrs[a][e]);
65 if (tail != 0 && ithr == nthr - 1) {
66 size_t start_e = nelems - tail;
67 size_t end_e = nelems;
70 for (size_t e = start_e; e < end_e; e++) {
71 output[e] = data_t(scales[0] * input_ptrs[0][e]);
73 for (int a = 1; a < num_arrs; a++) {
75 for (size_t e = start_e; e < end_e; e++) {
76 output[e] += data_t(scales[a] * input_ptrs[a][e]);
83 template struct simple_sum_t<data_type::f32>;