if (TCR_PTR(__kmp_threads[0]) == NULL) {
--capacity;
}
+ // If it is not for initializing the hidden helper team, we need to take
+ // __kmp_hidden_helper_threads_num out of the capacity because it is included
+ // in __kmp_threads_capacity.
+ if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
+ capacity -= __kmp_hidden_helper_threads_num;
+ }
if (__kmp_nth + new_nthreads -
(root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
capacity) {
--capacity;
}
+ // If it is not for initializing the hidden helper team, we need to take
+ // __kmp_hidden_helper_threads_num out of the capacity because it is included
+ // in __kmp_threads_capacity.
+ if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
+ capacity -= __kmp_hidden_helper_threads_num;
+ }
+
/* see if there are too many threads */
if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
if (__kmp_tp_cached) {
/* find an available thread slot */
// Don't reassign the zero slot since we need that to only be used by
// initial thread. Slots for hidden helper threads should also be skipped.
- if (initial_thread && __kmp_threads[0] == NULL) {
+ if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
gtid = 0;
} else {
for (gtid = __kmp_hidden_helper_threads_num + 1;
nth = (4 * __kmp_xproc);
// If hidden helper task is enabled, we initialize the thread capacity with
- // extra
- // __kmp_hidden_helper_threads_num.
- nth += __kmp_hidden_helper_threads_num;
+ // extra __kmp_hidden_helper_threads_num.
+ if (__kmp_enable_hidden_helper) {
+ nth += __kmp_hidden_helper_threads_num;
+ }
if (nth > __kmp_max_nth)
nth = __kmp_max_nth;
--- /dev/null
+// RUN: %libomp-cxx-compile-and-run
+
+#include <omp.h>
+
+#include <algorithm>
+#include <cassert>
+#include <chrono>
+#include <thread>
+#include <vector>
+
+// Body of the extra native thread started by main(): it makes a single call
+// into the OpenMP runtime and then stays alive for roughly one second.
+void dummy_root() {
+  // omp_get_max_threads() will do middle initialization. Only that side effect
+  // is wanted here, so the returned thread count is deliberately discarded.
+  (void)omp_get_max_threads();
+  // Keep this thread around for a while before it exits (presumably so that it
+  // overlaps with the parallel regions executed by main()).
+  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+}
+
+// Runs two parallel loops, requesting N and then N + 1 threads, while an extra
+// native thread (dummy_root) is alive, and verifies the values the loops wrote.
+// Returns 0 on success; a wrong value trips an assert.
+int main(int argc, char *argv[]) {
+  // N is at least 32 and at least four times both the maximum thread count and
+  // the processor count reported by the OpenMP runtime.
+  const int N = std::max(std::max(32, 4 * omp_get_max_threads()),
+                         4 * omp_get_num_procs());
+
+  std::vector<int> data(N);
+
+  // Create a new thread that calls into the OpenMP RTL (see dummy_root). Per
+  // the test's intent, that thread will not be taken as the "initial thread".
+  std::thread root(dummy_root);
+
+  // First pass: request N threads and store each index into its slot.
+#pragma omp parallel for num_threads(N)
+  for (int i = 0; i < N; ++i) {
+    data[i] = i;
+  }
+
+  // Second pass: request one thread more than before and add the index again.
+#pragma omp parallel for num_threads(N + 1)
+  for (int i = 0; i < N; ++i) {
+    data[i] += i;
+  }
+
+  // After both passes every slot must hold twice its index.
+  for (int i = 0; i < N; ++i) {
+    assert(data[i] == 2 * i);
+  }
+
+  // Wait for the helper thread before leaving main so it is never destroyed
+  // while still joinable.
+  root.join();
+
+  return 0;
+}
--- /dev/null
+// RUN: %libomp-cxx-compile-and-run
+
+#include <omp.h>
+
+#include <algorithm>
+#include <cassert>
+#include <vector>
+
+// Runs two parallel loops, requesting N and then N + 1 threads, and verifies
+// the values the loops wrote. Returns 0 on success; a wrong value trips an
+// assert.
+int main(int argc, char *argv[]) {
+  // N is at least 32 and at least four times both the maximum thread count and
+  // the processor count reported by the OpenMP runtime.
+  const int N = std::max(std::max(32, 4 * omp_get_max_threads()),
+                         4 * omp_get_num_procs());
+
+  std::vector<int> data(N);
+
+  // First pass: request N threads and store each index into its slot.
+#pragma omp parallel for num_threads(N)
+  for (int i = 0; i < N; ++i) {
+    data[i] = i;
+  }
+
+  // Second pass: request one thread more than before and add the index again.
+#pragma omp parallel for num_threads(N + 1)
+  for (int i = 0; i < N; ++i) {
+    data[i] += i;
+  }
+
+  // After both passes every slot must hold twice its index.
+  for (int i = 0; i < N; ++i) {
+    assert(data[i] == 2 * i);
+  }
+
+  return 0;
+}