1 // Copyright (C) 2018-2020 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
6 * @brief Contains declarations and definitions for sequential and multi-threading implementations.
8 * Multi-threading support is implemented in two variants: using the Threading Building Blocks library and OpenMP*
9 * product. To build a particular implementation, use the corresponding identifier: IE_THREAD_TBB, IE_THREAD_TBB_AUTO,
10 * IE_THREAD_OMP or IE_THREAD_SEQ.
12 * @file ie_parallel.hpp
// Threading-backend identifiers. The build selects one implementation by
// defining IE_THREAD to one of these values before this header is included.
19 #define IE_THREAD_TBB 0
20 #define IE_THREAD_OMP 1
21 #define IE_THREAD_SEQ 2
22 #define IE_THREAD_TBB_AUTO 3
24 #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
// Opt in to preview TBB features (local scheduler observers, NUMA support)
// before any TBB header is pulled in — these macros must precede the includes.
25 #define TBB_PREVIEW_LOCAL_OBSERVER 1
26 #ifndef TBB_PREVIEW_NUMA_SUPPORT
27 #define TBB_PREVIEW_NUMA_SUPPORT 1
29 #include "tbb/blocked_range.h"
30 #include "tbb/blocked_range2d.h"
31 #include "tbb/blocked_range3d.h"
32 #include "tbb/parallel_for.h"
33 #include "tbb/parallel_reduce.h"
34 #include "tbb/parallel_sort.h"
35 #include "tbb/task_arena.h"
36 #include "tbb/task_scheduler_observer.h"
// Portable thread-introspection wrappers, TBB flavor.
// Maximum concurrency of the task arena the caller currently runs in.
38 inline int parallel_get_max_threads() {
39 return tbb::this_task_arena::max_concurrency();
// Under TBB the "team size" is simply the arena's max concurrency.
41 inline int parallel_get_num_threads() {
42 return parallel_get_max_threads();
// Index of the calling thread within the current arena.
44 inline int parallel_get_thread_num() {
45 return tbb::this_task_arena::current_thread_index();
// NOTE(review): body elided in this view — presumably a no-op under TBB
// (thread count is controlled by the arena, not by this call); confirm in full source.
47 inline void parallel_set_num_threads(int n) {
// NOTE(review): body elided — likely returns a "no env override" sentinel; confirm.
50 inline int parallel_get_env_threads() {
// PARTITIONING appends a static partitioner for plain TBB; the TBB_AUTO
// branch (elided here) presumably expands to nothing — confirm in full source.
53 #if IE_THREAD == IE_THREAD_TBB
54 #define PARTITIONING , tbb::static_partitioner()
58 #elif IE_THREAD == IE_THREAD_OMP
65 /* MSVC still supports omp 2.0 only */
66 #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
68 #endif // defined(_MSC_VER) && !defined(__INTEL_COMPILER)
// Portable thread-introspection wrappers, OpenMP flavor: thin forwards to the
// OpenMP runtime API.
69 inline int parallel_get_max_threads() {
70 return omp_get_max_threads();
72 inline int parallel_get_num_threads() {
73 return omp_get_num_threads();
75 inline int parallel_get_thread_num() {
76 return omp_get_thread_num();
78 inline void parallel_set_num_threads(int n) {
79 omp_set_num_threads(n);
// Reads the OMP_NUM_THREADS environment variable. std::stoi failures are
// caught and swallowed — the (elided) catch body presumably leaves a default
// value in env_cores; confirm in full source.
81 inline int parallel_get_env_threads() {
83 if (getenv("OMP_NUM_THREADS") != nullptr) {
85 env_cores = std::stoi(getenv("OMP_NUM_THREADS"));
86 } catch (const std::exception&) {
93 #elif IE_THREAD == IE_THREAD_SEQ
94 #include <algorithm> // NOLINT
// Sequential fallback: a "team" of exactly one thread. Function bodies are
// elided in this view; they presumably return the trivial single-thread values.
95 inline int parallel_get_env_threads() {
98 inline int parallel_get_max_threads() {
101 inline int parallel_get_num_threads() {
104 inline int parallel_get_thread_num() {
107 inline void parallel_set_num_threads(int n) {
112 namespace InferenceEngine {
// Runs func(ithr, nthr) on a team of nthr threads; nthr == 0 means "use the
// backend's default team size". Under OpenMP the parallel region supplies the
// thread id / team size; under TBB a parallel_for over thread indices is used.
114 template <typename F>
115 void parallel_nt(int nthr, const F& func) {
116 #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
117 if (nthr == 0) nthr = parallel_get_max_threads();
123 tbb::parallel_for(0, nthr, [&](int ithr) {
126 #elif IE_THREAD == IE_THREAD_OMP
132 #pragma omp parallel num_threads(nthr)
133 func(parallel_get_thread_num(), parallel_get_num_threads());
134 #elif IE_THREAD == IE_THREAD_SEQ
// Like parallel_nt, but forces static partitioning under TBB so each logical
// thread index maps to a fixed worker. Degenerates to one serial func(0, 1)
// call (elided branch) when built sequential or when nthr == 1.
139 template <typename F>
140 void parallel_nt_static(int nthr, const F& func) {
141 #if IE_THREAD == IE_THREAD_SEQ
142 const bool serial = true;
144 const bool serial = false;
147 if (serial || nthr == 1) {
152 if (nthr == 0) nthr = parallel_get_max_threads();
153 #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
159 tbb::static_partitioner {});
161 #elif IE_THREAD == IE_THREAD_OMP
163 #pragma omp parallel num_threads(nthr)
164 { func(parallel_get_thread_num(), parallel_get_num_threads()); }
// Sorts [begin, end) with the given comparator. Only the TBB backends sort in
// parallel; OpenMP and sequential builds fall back to std::sort.
168 template <typename I, typename F>
169 void parallel_sort(I begin, I end, const F& comparator) {
170 #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
171 tbb::parallel_sort(begin, end, comparator);
172 #elif IE_THREAD == IE_THREAD_OMP
173 // TODO: propose OpenMP version
174 std::sort(begin, end, comparator);
175 #elif IE_THREAD == IE_THREAD_SEQ
176 std::sort(begin, end, comparator);
// Parallel reduction over [0, D0): accumulates func(i) on top of `input`.
// TBB path uses tbb::parallel_reduce; the OpenMP path uses a reduction(+)
// loop (the join lambda and return statements are elided in this view).
180 template <typename T0, typename R, typename F>
181 R parallel_sum(const T0& D0, const R& input, const F& func) {
182 #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
183 return tbb::parallel_reduce(
184 tbb::blocked_range<T0>(0, D0), input,
185 [&](const tbb::blocked_range<T0>& r, R init) -> R {
187 for (T0 dim1 = r.begin(); dim1 < r.end(); ++dim1) sum += func(dim1);
// Signed counter type: OpenMP 2.0 (MSVC) requires signed loop indices.
197 using T0_IT = typename std::make_signed<T0>::type;
202 #if IE_THREAD == IE_THREAD_OMP
203 #pragma omp parallel for reduction(+ : sum) schedule(static)
205 for (T0_IT dim1 = 0; dim1 < static_cast<T0_IT>(D0); dim1++) {
206 sum += static_cast<R>(func(dim1));
// 2-D parallel reduction over [0, D0) x [0, D1): accumulates func(i0, i1) on
// top of `input`. TBB uses a blocked_range2d (rows = D0, cols = D1); OpenMP
// collapses both loops into one reduction(+) loop nest.
212 template <typename T0, typename T1, typename R, typename F>
213 R parallel_sum2d(const T0& D0, const T1& D1, const R& input, const F& func) {
214 #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
215 return tbb::parallel_reduce(
216 tbb::blocked_range2d<T0, T1>(0, D0, 0, D1), input,
217 [&](const tbb::blocked_range2d<T0, T1>& r, R init) -> R {
219 for (T0 dim2 = r.rows().begin(); dim2 < r.rows().end(); dim2++) {
220 for (T1 dim1 = r.cols().begin(); dim1 < r.cols().end(); dim1++) {
221 sum += func(dim2, dim1);
// Signed counter types for OpenMP 2.0 (MSVC) compatibility.
233 using T0_IT = typename std::make_signed<T0>::type;
234 using T1_IT = typename std::make_signed<T1>::type;
240 #if IE_THREAD == IE_THREAD_OMP
241 #pragma omp parallel for collapse(2) reduction(+ : sum) schedule(static)
// NOTE(review): bounds lack the static_cast<T0_IT>/<T1_IT> used in
// parallel_sum/parallel_sum3d — signed/unsigned comparison if D0/D1 are
// unsigned. Inconsistent with the siblings; worth fixing in the full source.
243 for (T0_IT dim2 = 0; dim2 < D0; dim2++) {
244 for (T1_IT dim1 = 0; dim1 < D1; dim1++) {
245 sum += func(dim2, dim1);
// 3-D parallel reduction over [0, D0) x [0, D1) x [0, D2): accumulates
// func(i0, i1, i2) on top of `input`. TBB uses a blocked_range3d
// (pages = D0, rows = D1, cols = D2); OpenMP collapses all three loops.
251 template <typename T0, typename T1, typename T2, typename R, typename F>
252 R parallel_sum3d(const T0& D0, const T1& D1, const T2& D2, const R& input, const F& func) {
253 #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
254 return tbb::parallel_reduce(
255 tbb::blocked_range3d<T0, T1, T2>(0, D0, 0, D1, 0, D2), input,
256 [&](const tbb::blocked_range3d<T0, T1, T2>& r, R init) -> R {
258 for (T0 dim1 = r.pages().begin(); dim1 < r.pages().end(); dim1++) {
259 for (T1 dim2 = r.rows().begin(); dim2 < r.rows().end(); dim2++) {
260 for (T2 dim3 = r.cols().begin(); dim3 < r.cols().end(); dim3++) {
261 sum += func(dim1, dim2, dim3);
// Signed counter types for OpenMP 2.0 (MSVC) compatibility.
274 using T0_IT = typename std::make_signed<T0>::type;
275 using T1_IT = typename std::make_signed<T1>::type;
276 using T2_IT = typename std::make_signed<T2>::type;
283 #if IE_THREAD == IE_THREAD_OMP
284 #pragma omp parallel for collapse(3) reduction(+ : sum) schedule(static)
286 for (T0_IT dim1 = 0; dim1 < static_cast<T0_IT>(D0); dim1++) {
287 for (T1_IT dim2 = 0; dim2 < static_cast<T1_IT>(D1); dim2++) {
288 for (T2_IT dim3 = 0; dim3 < static_cast<T2_IT>(D2); dim3++) {
289 sum += func(dim1, dim2, dim3);
// Base case of the recursion below: no dimensions left, return the remaining
// linear index unchanged.
297 template <typename T>
298 inline T parallel_it_init(T start) {
// Decomposes linear index `start` into per-dimension counters (x, X pairs,
// innermost last): recurses over the tail first, then — on elided lines —
// presumably sets x = start % X and returns start / X; confirm in full source.
301 template <typename T, typename Q, typename R, typename... Args>
302 inline T parallel_it_init(T start, Q& x, const R& X, Args&&... tuple) {
303 start = parallel_it_init(start, static_cast<Args>(tuple)...);
// Base case: with no dimensions left, always report a carry to the caller.
308 inline bool parallel_it_step() {
// Advances the multi-dimensional counters by one position (odometer style):
// steps the inner dimensions first; when they carry, this dimension is
// incremented (elided lines) and the carry is propagated outward via the
// return value.
311 template <typename Q, typename R, typename... Args>
312 inline bool parallel_it_step(Q& x, const R& X, Args&&... tuple) {
313 if (parallel_it_step(static_cast<Args>(tuple)...)) {
// Splits n iterations across `team` workers for worker `tid`, writing the
// worker's range into n_start / n_end. The first T1 workers get the larger
// chunk n1, the rest get n2 (declared on an elided line, presumably n1 - 1).
// NOTE(review): n_end is first assigned a chunk *size* here; the elided tail
// likely converts it to an end offset (n_end += n_start) — confirm in full source.
320 template <typename T, typename Q>
321 inline void splitter(const T& n, const Q& team, const Q& tid, T& n_start, T& n_end) {
// Trivial case (single worker or no work) handled in the elided branch.
322 if (team <= 1 || n == 0) {
326 T n1 = (n + (T)team - 1) / (T)team;
328 T T1 = n - n2 * (T)team;
329 n_end = (T)tid < T1 ? n1 : n2;
330 n_start = (T)tid <= T1 ? tid * n1 : T1 * n1 + ((T)tid - T1) * n2;
// Executes func(i) for thread ithr's share of [0, D0), as computed by splitter.
// (The declaration of d0/end sits on an elided line.)
336 template <typename T0, typename F>
337 void for_1d(const int& ithr, const int& nthr, const T0& D0, const F& func) {
339 splitter(D0, nthr, ithr, d0, end);
340 for (; d0 < end; ++d0) func(d0);
// Parallel loop over [0, D0) calling func(i). Plain TBB caps the team at the
// work amount and runs serially when one thread suffices (elided branch),
// otherwise distributes static chunks; TBB_AUTO lets the scheduler partition;
// OpenMP uses a parallel region; SEQ runs everything on the caller.
343 template <typename T0, typename F>
344 void parallel_for(const T0& D0, const F& func) {
345 #if IE_THREAD == IE_THREAD_TBB
346 auto work_amount = static_cast<size_t>(D0);
347 int nthr = parallel_get_max_threads();
// Never spawn more workers than iterations.
348 if (static_cast<size_t>(nthr) > work_amount) nthr = static_cast<int>(work_amount);
350 for_1d(0, 1, D0, func);
355 for_1d(ithr, nthr, D0, func);
357 tbb::static_partitioner());
359 #elif IE_THREAD == IE_THREAD_TBB_AUTO
360 const int nthr = parallel_get_max_threads();
361 tbb::parallel_for(0, nthr, [&](int ithr) {
362 for_1d(ithr, nthr, D0, func);
364 #elif IE_THREAD == IE_THREAD_OMP
366 for_1d(parallel_get_thread_num(), parallel_get_num_threads(), D0, func);
367 #elif IE_THREAD == IE_THREAD_SEQ
368 for_1d(0, 1, D0, func);
// Collapses the D0 x D1 space into one linear range, takes this thread's
// slice via splitter, and walks it with the parallel_it_* odometer helpers
// (counter declarations and the func(d0, d1) call sit on elided lines).
372 template <typename T0, typename T1, typename F>
373 void for_2d(const int& ithr, const int& nthr, const T0& D0, const T1& D1, const F& func) {
374 const size_t work_amount = (size_t)D0 * D1;
375 if (work_amount == 0) return;
376 size_t start {0}, end {0};
377 splitter(work_amount, nthr, ithr, start, end);
381 parallel_it_init(start, d0, D0, d1, D1);
382 for (size_t iwork = start; iwork < end; ++iwork) {
384 parallel_it_step(d0, D0, d1, D1);
// Parallel loop over the collapsed D0 x D1 space calling func(i0, i1); same
// backend dispatch scheme as parallel_for, delegating chunks to for_2d.
388 template <typename T0, typename T1, typename F>
389 void parallel_for2d(const T0& D0, const T1& D1, const F& func) {
390 #if IE_THREAD == IE_THREAD_TBB
391 auto work_amount = static_cast<size_t>(D0 * D1);
392 int nthr = parallel_get_max_threads();
// Never spawn more workers than iterations.
393 if (static_cast<size_t>(nthr) > work_amount) nthr = static_cast<int>(work_amount);
395 for_2d(0, 1, D0, D1, func);
400 for_2d(ithr, nthr, D0, D1, func);
402 tbb::static_partitioner());
404 #elif IE_THREAD == IE_THREAD_TBB_AUTO
405 const int nthr = parallel_get_max_threads();
406 tbb::parallel_for(0, nthr, [&](int ithr) {
407 for_2d(ithr, nthr, D0, D1, func);
409 #elif IE_THREAD == IE_THREAD_OMP
411 for_2d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, func);
412 #elif IE_THREAD == IE_THREAD_SEQ
413 for_2d(0, 1, D0, D1, func);
// Collapses the D0 x D1 x D2 space into one linear range, takes this thread's
// slice via splitter, and walks it with the parallel_it_* odometer helpers
// (counter declarations and the func(d0, d1, d2) call sit on elided lines).
417 template <typename T0, typename T1, typename T2, typename F>
418 void for_3d(const int& ithr, const int& nthr, const T0& D0, const T1& D1, const T2& D2, const F& func) {
419 const size_t work_amount = (size_t)D0 * D1 * D2;
420 if (work_amount == 0) return;
421 size_t start {0}, end {0};
422 splitter(work_amount, nthr, ithr, start, end);
427 parallel_it_init(start, d0, D0, d1, D1, d2, D2);
428 for (size_t iwork = start; iwork < end; ++iwork) {
430 parallel_it_step(d0, D0, d1, D1, d2, D2);
// Parallel loop over the collapsed D0 x D1 x D2 space calling func(i0, i1, i2);
// same backend dispatch scheme as parallel_for, delegating chunks to for_3d.
434 template <typename T0, typename T1, typename T2, typename F>
435 void parallel_for3d(const T0& D0, const T1& D1, const T2& D2, const F& func) {
436 #if IE_THREAD == IE_THREAD_TBB
437 auto work_amount = static_cast<size_t>(D0 * D1 * D2);
438 int nthr = parallel_get_max_threads();
// Never spawn more workers than iterations.
439 if (static_cast<size_t>(nthr) > work_amount) nthr = static_cast<int>(work_amount);
441 for_3d(0, 1, D0, D1, D2, func);
446 for_3d(ithr, nthr, D0, D1, D2, func);
448 tbb::static_partitioner());
450 #elif IE_THREAD == IE_THREAD_TBB_AUTO
451 const int nthr = parallel_get_max_threads();
452 tbb::parallel_for(0, nthr, [&](int ithr) {
453 for_3d(ithr, nthr, D0, D1, D2, func);
455 #elif IE_THREAD == IE_THREAD_OMP
457 for_3d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, func);
458 #elif IE_THREAD == IE_THREAD_SEQ
459 for_3d(0, 1, D0, D1, D2, func);
// Collapses the D0 x D1 x D2 x D3 space into one linear range, takes this
// thread's slice via splitter, and walks it with the parallel_it_* odometer
// helpers (counter declarations sit on elided lines).
463 template <typename T0, typename T1, typename T2, typename T3, typename F>
464 void for_4d(const int& ithr, const int& nthr, const T0& D0, const T1& D1, const T2& D2, const T3& D3, const F& func) {
465 const size_t work_amount = (size_t)D0 * D1 * D2 * D3;
466 if (work_amount == 0) return;
467 size_t start {0}, end {0};
468 splitter(work_amount, nthr, ithr, start, end);
474 parallel_it_init(start, d0, D0, d1, D1, d2, D2, d3, D3);
475 for (size_t iwork = start; iwork < end; ++iwork) {
476 func(d0, d1, d2, d3);
477 parallel_it_step(d0, D0, d1, D1, d2, D2, d3, D3);
// Parallel loop over the collapsed 4-D space calling func(i0, i1, i2, i3);
// same backend dispatch scheme as parallel_for, delegating chunks to for_4d.
481 template <typename T0, typename T1, typename T2, typename T3, typename F>
482 void parallel_for4d(const T0& D0, const T1& D1, const T2& D2, const T3& D3, const F& func) {
483 #if IE_THREAD == IE_THREAD_TBB
484 auto work_amount = static_cast<size_t>(D0 * D1 * D2 * D3);
485 int nthr = parallel_get_max_threads();
// Never spawn more workers than iterations.
486 if (static_cast<size_t>(nthr) > work_amount) nthr = static_cast<int>(work_amount);
488 for_4d(0, 1, D0, D1, D2, D3, func);
493 for_4d(ithr, nthr, D0, D1, D2, D3, func);
495 tbb::static_partitioner());
497 #elif IE_THREAD == IE_THREAD_TBB_AUTO
498 const int nthr = parallel_get_max_threads();
499 tbb::parallel_for(0, nthr, [&](int ithr) {
500 for_4d(ithr, nthr, D0, D1, D2, D3, func);
502 #elif IE_THREAD == IE_THREAD_OMP
504 for_4d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, func);
505 #elif IE_THREAD == IE_THREAD_SEQ
506 for_4d(0, 1, D0, D1, D2, D3, func);
510 template <typename T0, typename T1, typename T2, typename T3, typename T4, typename F>
511 void for_5d(const int& ithr, const int& nthr, const T0& D0, const T1& D1, const T2& D2, const T3& D3, const T4& D4,
513 const size_t work_amount = (size_t)D0 * D1 * D2 * D3 * D4;
514 if (work_amount == 0) return;
515 size_t start {0}, end {0};
516 splitter(work_amount, nthr, ithr, start, end);
523 parallel_it_init(start, d0, D0, d1, D1, d2, D2, d3, D3, d4, D4);
524 for (size_t iwork = start; iwork < end; ++iwork) {
525 func(d0, d1, d2, d3, d4);
526 parallel_it_step(d0, D0, d1, D1, d2, D2, d3, D3, d4, D4);
530 template <typename T0, typename T1, typename T2, typename T3, typename T4, typename F>
531 void parallel_for5d(const T0& D0, const T1& D1, const T2& D2, const T3& D3, const T4& D4, const F& func) {
532 #if IE_THREAD == IE_THREAD_TBB
533 auto work_amount = static_cast<size_t>(D0 * D1 * D2 * D3 * D4);
534 int nthr = parallel_get_max_threads();
535 if (static_cast<size_t>(nthr) > work_amount) nthr = static_cast<int>(work_amount);
537 for_5d(0, 1, D0, D1, D2, D3, D4, func);
542 for_5d(ithr, nthr, D0, D1, D2, D3, D4, func);
544 tbb::static_partitioner());
546 #elif IE_THREAD == IE_THREAD_TBB_AUTO
547 const int nthr = parallel_get_max_threads();
548 tbb::parallel_for(0, nthr, [&](int ithr) {
549 for_5d(ithr, nthr, D0, D1, D2, D3, D4, func);
551 #elif IE_THREAD == IE_THREAD_OMP
553 for_5d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, D4, func);
554 #elif IE_THREAD == IE_THREAD_SEQ
555 for_5d(0, 1, D0, D1, D2, D3, D4, func);
559 } // namespace InferenceEngine