From: Shilei Tian Date: Thu, 18 Mar 2021 22:25:21 +0000 (-0400) Subject: [OpenMP] Fixed a crash in hidden helper thread X-Git-Tag: llvmorg-14-init~11970 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=2df65f87c1ea81008768e14522e5d9277234ba70;p=platform%2Fupstream%2Fllvm.git [OpenMP] Fixed a crash in hidden helper thread It is reported that after enabling hidden helper thread, the program can hit the assertion `new_gtid < __kmp_threads_capacity` sometimes. The root cause is explained as follows. Let's say the default `__kmp_threads_capacity` is `N`. If hidden helper thread is enabled, `__kmp_threads_capacity` will be offset to `N+8` by default. If the number of threads we need exceeds `N+8`, e.g. via `num_threads` clause, we need to expand `__kmp_threads`. In `__kmp_expand_threads`, the expansion starts from `__kmp_threads_capacity`, and repeatedly doubling it until the new capacity meets the requirement. Let's assume the new requirement is `Y`. If `Y` happens to meet the constraint `(N+8)*2^X=Y` where `X` is the number of iterations, the new capacity is not enough because we have 8 slots for hidden helper threads. Here is an example. ``` #include int main(int argc, char *argv[]) { constexpr const size_t N = 1344; std::vector data(N); #pragma omp parallel for for (unsigned i = 0; i < N; ++i) { data[i] = i; } #pragma omp parallel for num_threads(N) for (unsigned i = 0; i < N; ++i) { data[i] += i; } return 0; } ``` My CPU is 20C40T, then `__kmp_threads_capacity` is 160. After offset, `__kmp_threads_capacity` becomes 168. `1344 = (160+8)*2^3`, then the assertions hit. Reviewed By: protze.joachim Differential Revision: https://reviews.llvm.org/D98838 --- diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp index 8f42a9d..8ebbd03 100644 --- a/openmp/runtime/src/kmp_runtime.cpp +++ b/openmp/runtime/src/kmp_runtime.cpp @@ -854,6 +854,12 @@ static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team, if (TCR_PTR(__kmp_threads[0]) == NULL) { --capacity; } + // If it is not for initializing the hidden helper team, we need to take + // __kmp_hidden_helper_threads_num out of the capacity because it is included + // in __kmp_threads_capacity. + if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) { + capacity -= __kmp_hidden_helper_threads_num; + } if (__kmp_nth + new_nthreads - (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > capacity) { @@ -3607,6 +3613,13 @@ int __kmp_register_root(int initial_thread) { --capacity; } + // If it is not for initializing the hidden helper team, we need to take + // __kmp_hidden_helper_threads_num out of the capacity because it is included + // in __kmp_threads_capacity. + if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) { + capacity -= __kmp_hidden_helper_threads_num; + } + /* see if there are too many threads */ if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) { if (__kmp_tp_cached) { @@ -3639,7 +3652,7 @@ int __kmp_register_root(int initial_thread) { /* find an available thread slot */ // Don't reassign the zero slot since we need that to only be used by // initial thread. Slots for hidden helper threads should also be skipped. - if (initial_thread && __kmp_threads[0] == NULL) { + if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { gtid = 0; } else { for (gtid = __kmp_hidden_helper_threads_num + 1; diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp index 35c15ee..dd23348 100644 --- a/openmp/runtime/src/kmp_settings.cpp +++ b/openmp/runtime/src/kmp_settings.cpp @@ -504,9 +504,10 @@ int __kmp_initial_threads_capacity(int req_nproc) { nth = (4 * __kmp_xproc); // If hidden helper task is enabled, we initialize the thread capacity with - // extra - // __kmp_hidden_helper_threads_num. - nth += __kmp_hidden_helper_threads_num; + // extra __kmp_hidden_helper_threads_num. + if (__kmp_enable_hidden_helper) { + nth += __kmp_hidden_helper_threads_num; + } if (nth > __kmp_max_nth) nth = __kmp_max_nth; diff --git a/openmp/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp b/openmp/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp new file mode 100644 index 0000000..776aee9 --- /dev/null +++ b/openmp/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp @@ -0,0 +1,45 @@ +// RUN: %libomp-cxx-compile-and-run + +#include + +#include +#include +#include +#include +#include + +void dummy_root() { + // omp_get_max_threads() will do middle initialization + int nthreads = omp_get_max_threads(); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); +} + +int main(int argc, char *argv[]) { + const int N = std::min(std::max(std::max(32, 4 * omp_get_max_threads()), + 4 * omp_get_num_procs()), + std::numeric_limits::max()); + + std::vector data(N); + + // Create a new thread to initialize the OpenMP RTL. The new thread will not + // be taken as the "initial thread". + std::thread root(dummy_root); + +#pragma omp parallel for num_threads(N) + for (unsigned i = 0; i < N; ++i) { + data[i] = i; + } + +#pragma omp parallel for num_threads(N + 1) + for (unsigned i = 0; i < N; ++i) { + data[i] += i; + } + + for (unsigned i = 0; i < N; ++i) { + assert(data[i] == 2 * i); + } + + root.join(); + + return 0; +} diff --git a/openmp/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp b/openmp/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp new file mode 100644 index 0000000..a9d394f --- /dev/null +++ b/openmp/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp @@ -0,0 +1,31 @@ +// RUN: %libomp-cxx-compile-and-run + +#include + +#include +#include +#include + +int main(int argc, char *argv[]) { + const int N = std::min(std::max(std::max(32, 4 * omp_get_max_threads()), + 4 * omp_get_num_procs()), + std::numeric_limits::max()); + + std::vector data(N); + +#pragma omp parallel for num_threads(N) + for (unsigned i = 0; i < N; ++i) { + data[i] = i; + } + +#pragma omp parallel for num_threads(N + 1) + for (unsigned i = 0; i < N; ++i) { + data[i] += i; + } + + for (unsigned i = 0; i < N; ++i) { + assert(data[i] == 2 * i); + } + + return 0; +}