From a756cb240a99f791bbbce24a764b163e5e52a2b6 Mon Sep 17 00:00:00 2001
From: Andrey Churbanov
Date: Thu, 16 Nov 2017 10:45:07 +0000
Subject: [PATCH] Exclude untied tasks from the task scheduling constraint
 (TSC) check

This can improve the performance of tests that use untied tasks.

Differential Revision: https://reviews.llvm.org/D39613

llvm-svn: 318388
---
 openmp/runtime/src/kmp.h           |  12 +-
 openmp/runtime/src/kmp_global.cpp  |   3 +-
 openmp/runtime/src/kmp_tasking.cpp | 250 +++++++++++++++++++++++--------------
 3 files changed, 164 insertions(+), 101 deletions(-)

diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index d301253..d02bd8b 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -2028,7 +2028,7 @@ typedef enum kmp_tasking_mode {
 
 extern kmp_tasking_mode_t
     __kmp_tasking_mode; /* determines how/when to execute tasks */
-extern kmp_int32 __kmp_task_stealing_constraint;
+extern int __kmp_task_stealing_constraint;
 #if OMP_40_ENABLED
 extern kmp_int32 __kmp_default_device; // Set via OMP_DEFAULT_DEVICE if
                                        // specified, defaults to 0 otherwise
@@ -2248,13 +2248,14 @@ struct kmp_taskdata { /* aligned during dynamic allocation */
       *td_dephash; // Dependencies for children tasks are tracked from here
   kmp_depnode_t
       *td_depnode; // Pointer to graph node if this task has dependencies
-#endif
-#if OMPT_SUPPORT
-  ompt_task_info_t ompt_task_info;
-#endif
+#endif // OMP_40_ENABLED
 #if OMP_45_ENABLED
   kmp_task_team_t *td_task_team;
   kmp_int32 td_size_alloc; // The size of task structure, including shareds etc.
+#endif // OMP_45_ENABLED
+  kmp_taskdata_t *td_last_tied; // keep tied task for task scheduling constraint
+#if OMPT_SUPPORT
+  ompt_task_info_t ompt_task_info;
 #endif
 }; // struct kmp_taskdata
@@ -2312,6 +2313,7 @@ typedef struct kmp_base_task_team {
   kmp_int32 tt_found_proxy_tasks; /* Have we found proxy tasks since last
                                      barrier */
 #endif
+  kmp_int32 tt_untied_task_encountered;
 
   KMP_ALIGN_CACHE
   volatile kmp_int32 tt_unfinished_threads; /* #threads still active */
diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp
index 3cd4734..20f1478 100644
--- a/openmp/runtime/src/kmp_global.cpp
+++ b/openmp/runtime/src/kmp_global.cpp
@@ -305,8 +305,7 @@ kmp_uint64 __kmp_taskloop_min_tasks = 0;
    be redefined to have exactly 32 bits. */
 KMP_BUILD_ASSERT(sizeof(kmp_tasking_flags_t) == 4);
-kmp_int32 __kmp_task_stealing_constraint =
-    1; /* Constrain task stealing by default */
+int __kmp_task_stealing_constraint = 1; /* Constrain task stealing by default */
 
 #ifdef DEBUG_SUSPEND
 int __kmp_suspend_count = 0;
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 154a9f9..03511ba 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -806,13 +806,6 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
     if (resumed_task == NULL) {
       resumed_task = taskdata->td_parent; // In a serialized task, the resumed
                                           // task is the parent
-    } else
-#if OMP_45_ENABLED
-        if (!(task_team && task_team->tt.tt_found_proxy_tasks))
-#endif
-    {
-      // verify resumed task passed in points to parent
-      KMP_DEBUG_ASSERT(resumed_task == taskdata->td_parent);
     }
   } else {
     KMP_DEBUG_ASSERT(resumed_task !=
@@ -949,6 +942,7 @@ void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
 #if OMP_40_ENABLED
   task->td_depnode = NULL;
 #endif
+  task->td_last_tied = task;
 
   if (set_curr_task) { // only do this init first time thread is created
     task->td_incomplete_child_tasks = 0;
@@ -1040,6 +1034,12 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
     }
     flags->final = 1;
   }
+  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
+    // An encountered untied task causes the TSC algorithm to check the entire
+    // deque of the victim thread. If no untied task was encountered, checking
+    // the head of the deque is enough.
+    KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
+  }
 
 #if OMP_45_ENABLED
   if (flags->proxy == TASK_PROXY) {
@@ -1184,6 +1184,10 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
   taskdata->td_dephash = NULL;
   taskdata->td_depnode = NULL;
 #endif
+  if (flags->tiedness == TASK_UNTIED)
+    taskdata->td_last_tied = NULL; // will be set when the task is scheduled
+  else
+    taskdata->td_last_tied = taskdata;
 
 // Only need to keep track of child task counts if team parallel and tasking not
 // serialized or if it is a proxy task
@@ -1349,6 +1353,10 @@ static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
 
   // Invoke the task routine and pass in relevant data.
   // Thunks generated by gcc take a different argument list.
   if (!discard) {
+    if (taskdata->td_flags.tiedness == TASK_UNTIED) {
+      taskdata->td_last_tied = current_task->td_last_tied;
+      KMP_DEBUG_ASSERT(taskdata->td_last_tied);
+    }
 #if KMP_STATS_ENABLED
     KMP_COUNT_BLOCK(TASK_executed);
     switch (KMP_GET_THREAD_STATE()) {
@@ -2029,6 +2037,10 @@ void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
   KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
 
   if (__kmp_tasking_mode != tskm_immediate_exec) {
+    // mark the task as waiting, but not on a barrier
+    taskdata->td_taskwait_counter += 1;
+    taskdata->td_taskwait_ident = loc;
+    taskdata->td_taskwait_thread = gtid + 1;
 #if USE_ITT_BUILD
     // For ITT the taskgroup wait is similar to taskwait until we need to
     // distinguish them
@@ -2060,6 +2072,7 @@ void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
                                  __kmp_task_stealing_constraint);
       }
     }
+    taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting
 
 #if OMPT_SUPPORT && OMPT_OPTIONAL
     if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
@@ -2144,26 +2157,31 @@ static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
   taskdata = thread_data->td.td_deque[tail];
 
   if (is_constrained && (taskdata->td_flags.tiedness == TASK_TIED)) {
-    // we need to check if the candidate obeys task scheduling constraint:
-    // only child of current task can be scheduled
-    kmp_taskdata_t *current = thread->th.th_current_task;
-    kmp_int32 level = current->td_level;
-    kmp_taskdata_t *parent = taskdata->td_parent;
-    while (parent != current && parent->td_level > level) {
-      parent = parent->td_parent; // check generation up to the level of the
-      // current task
-      KMP_DEBUG_ASSERT(parent != NULL);
-    }
-    if (parent != current) {
-      // If the tail task is not a child, then no other child can appear in the
-      // deque.
-      __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
-      KA_TRACE(10,
-               ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
-                "ntasks=%d head=%u tail=%u\n",
-                gtid, thread_data->td.td_deque_ntasks,
-                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
-      return NULL;
+    // we need to check if the candidate obeys the task scheduling constraint
+    // (TSC): only a descendant of all deferred tied tasks can be scheduled;
+    // checking the last one is enough, as it in turn descends from all others
+    kmp_taskdata_t *current = thread->th.th_current_task->td_last_tied;
+    KMP_DEBUG_ASSERT(current != NULL);
+    // check that the last tied task is not suspended on a barrier
+    if (current->td_flags.tasktype == TASK_EXPLICIT ||
+        current->td_taskwait_thread > 0) { // <= 0 on barrier
+      kmp_int32 level = current->td_level;
+      kmp_taskdata_t *parent = taskdata->td_parent;
+      while (parent != current && parent->td_level > level) {
+        parent = parent->td_parent; // check generation up to the level of the
+        // current task
+        KMP_DEBUG_ASSERT(parent != NULL);
+      }
+      if (parent != current) {
+        // The TSC does not allow this task to be scheduled
+        __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
+        KA_TRACE(10,
+                 ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
+                  "ntasks=%d head=%u tail=%u\n",
+                  gtid, thread_data->td.td_deque_ntasks,
+                  thread_data->td.td_deque_head,
+                  thread_data->td.td_deque_tail));
+        return NULL;
+      }
     }
   }
 
@@ -2184,14 +2202,16 @@ static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
 
 // __kmp_steal_task: remove a task from another thread's deque
 // Assume that calling thread has already checked existence of
 // task_team thread_data before calling this routine.
-static kmp_task_t *__kmp_steal_task(kmp_info_t *victim, kmp_int32 gtid,
+static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
                                     kmp_task_team_t *task_team,
                                     volatile kmp_int32 *unfinished_threads,
                                     int *thread_finished,
                                     kmp_int32 is_constrained) {
   kmp_task_t *task;
   kmp_taskdata_t *taskdata;
+  kmp_taskdata_t *current;
   kmp_thread_data_t *victim_td, *threads_data;
+  kmp_int32 level, target;
   kmp_int32 victim_tid;
 
   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
@@ -2199,25 +2219,19 @@ static kmp_task_t *__kmp_steal_task(kmp_info_t *victim, kmp_int32 gtid,
   threads_data = task_team->tt.tt_threads_data;
   KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
 
-  victim_tid = victim->th.th_info.ds.ds_tid;
+  victim_tid = victim_thr->th.th_info.ds.ds_tid;
   victim_td = &threads_data[victim_tid];
 
   KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
-                "task_team=%p ntasks=%d "
-                "head=%u tail=%u\n",
-                gtid, __kmp_gtid_from_thread(victim), task_team,
+                "task_team=%p ntasks=%d head=%u tail=%u\n",
+                gtid, __kmp_gtid_from_thread(victim_thr), task_team,
                 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
                 victim_td->td.td_deque_tail));
 
-  if ((TCR_4(victim_td->td.td_deque_ntasks) ==
-       0) || // Caller should not check this condition
-      (TCR_PTR(victim->th.th_task_team) !=
-       task_team)) // GEH: why would this happen?
-  {
+  if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
     KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
-                  "task_team=%p "
-                  "ntasks=%d head=%u tail=%u\n",
-                  gtid, __kmp_gtid_from_thread(victim), task_team,
+                  "task_team=%p ntasks=%d head=%u tail=%u\n",
+                  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
                   victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
                   victim_td->td.td_deque_tail));
     return NULL;
@@ -2225,53 +2239,103 @@ static kmp_task_t *__kmp_steal_task(kmp_info_t *victim, kmp_int32 gtid,
 
   __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
 
+  int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
   // Check again after we acquire the lock
-  if ((TCR_4(victim_td->td.td_deque_ntasks) == 0) ||
-      (TCR_PTR(victim->th.th_task_team) !=
-       task_team)) // GEH: why would this happen?
-  {
+  if (ntasks == 0) {
     __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
     KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
-                  "task_team=%p "
-                  "ntasks=%d head=%u tail=%u\n",
-                  gtid, __kmp_gtid_from_thread(victim), task_team,
-                  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
-                  victim_td->td.td_deque_tail));
+                  "task_team=%p ntasks=%d head=%u tail=%u\n",
+                  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
+                  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
     return NULL;
   }
 
   KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
 
   taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
-  if (is_constrained) {
-    // we need to check if the candidate obeys task scheduling constraint:
-    // only descendant of current task can be scheduled
-    kmp_taskdata_t *current = __kmp_threads[gtid]->th.th_current_task;
-    kmp_int32 level = current->td_level;
-    kmp_taskdata_t *parent = taskdata->td_parent;
-    while (parent != current && parent->td_level > level) {
-      parent = parent->td_parent; // check generation up to the level of the
-      // current task
-      KMP_DEBUG_ASSERT(parent != NULL);
+  if (is_constrained && (taskdata->td_flags.tiedness == TASK_TIED)) {
+    // we need to check if the candidate obeys the task scheduling constraint
+    // (TSC): only a descendant of all deferred tied tasks can be scheduled;
+    // checking the last one is enough, as it in turn descends from all others
+    current = __kmp_threads[gtid]->th.th_current_task->td_last_tied;
+    KMP_DEBUG_ASSERT(current != NULL);
+    // check that the last tied task is not suspended on a barrier
+    if (current->td_flags.tasktype == TASK_EXPLICIT ||
+        current->td_taskwait_thread > 0) { // <= 0 on barrier
+      level = current->td_level;
+      kmp_taskdata_t *parent = taskdata->td_parent;
+      while (parent != current && parent->td_level > level) {
+        parent = parent->td_parent; // check generation up to the level of the
+        // current task
+        KMP_DEBUG_ASSERT(parent != NULL);
+      }
+      if (parent != current) {
+        if (!task_team->tt.tt_untied_task_encountered) {
+          // The TSC does not allow the victim task to be stolen
+          __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
+          KA_TRACE(10,
+                   ("__kmp_steal_task(exit #3): T#%d could not steal from "
+                    "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
+                    gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
+                    victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
+          return NULL;
+        }
+        taskdata = NULL; // will check other tasks in victim's deque
+      }
     }
-    if (parent != current) {
-      // If the head task is not a descendant of the current task then do not
-      // steal it. No other task in victim's deque can be a descendant of the
-      // current task.
+  }
+  if (taskdata != NULL) {
+    // Bump head pointer and wrap.
+    victim_td->td.td_deque_head =
+        (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
+  } else {
+    int i;
+    // walk through the victim's deque trying to steal any task
+    target = victim_td->td.td_deque_head;
+    for (i = 1; i < ntasks; ++i) {
+      target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
+      taskdata = victim_td->td.td_deque[target];
+      if (taskdata->td_flags.tiedness == TASK_TIED) {
+        // check if the candidate obeys the TSC
+        kmp_taskdata_t *parent = taskdata->td_parent;
+        // check generation up to the level of the current task
+        while (parent != current && parent->td_level > level) {
+          parent = parent->td_parent;
+          KMP_DEBUG_ASSERT(parent != NULL);
+        }
+        if (parent != current) {
+          // The TSC does not allow the candidate to be stolen
+          taskdata = NULL;
+          continue;
+        } else {
+          // found a suitable tied task to steal
+          break;
+        }
+      } else {
+        // found an untied task to steal
+        break;
+      }
+    }
+    if (taskdata == NULL) {
+      // No appropriate candidate to steal was found
       __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
-      KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from "
-                    "T#%d: task_team=%p "
-                    "ntasks=%d head=%u tail=%u\n",
-                    gtid,
-                    __kmp_gtid_from_thread(threads_data[victim_tid].td.td_thr),
-                    task_team, victim_td->td.td_deque_ntasks,
+      KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
+                    "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
+                    gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
                     victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
       return NULL;
     }
+    int prev = target;
+    for (i = i + 1; i < ntasks; ++i) {
+      // shift remaining tasks in the deque left by 1
+      target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
+      victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
+      prev = target;
+    }
+    KMP_DEBUG_ASSERT(victim_td->td.td_deque_tail ==
+                     ((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
+    victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)
+  }
-  // Bump head pointer and Wrap.
-  victim_td->td.td_deque_head =
-      (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
 
   if (*thread_finished) {
     // We need to un-mark this victim as a finished victim.  This must be done
     // before releasing the lock, or else other threads (starting with the
     // master victim) might be prematurely released from the barrier!!!
@@ -2287,19 +2351,16 @@ static kmp_task_t *__kmp_steal_task(kmp_info_t *victim, kmp_int32 gtid,
     *thread_finished = FALSE;
   }
 
-  TCW_4(victim_td->td.td_deque_ntasks,
-        TCR_4(victim_td->td.td_deque_ntasks) - 1);
+  TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);
 
   __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
 
   KMP_COUNT_BLOCK(TASK_stolen);
-  KA_TRACE(
-      10,
-      ("__kmp_steal_task(exit #3): T#%d stole task %p from T#%d: task_team=%p "
-       "ntasks=%d head=%u tail=%u\n",
-       gtid, taskdata, __kmp_gtid_from_thread(victim), task_team,
-       victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
-       victim_td->td.td_deque_tail));
+  KA_TRACE(10,
+           ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
+            "task_team=%p ntasks=%d head=%u tail=%u\n",
+            gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
+            ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
 
   task = KMP_TASKDATA_TO_TASK(taskdata);
   return task;
@@ -2325,7 +2386,7 @@ static inline int __kmp_execute_tasks_template(
   kmp_info_t *other_thread;
   kmp_taskdata_t *current_task = thread->th.th_current_task;
   volatile kmp_int32 *unfinished_threads;
-  kmp_int32 nthreads, victim = -2, use_own_tasks = 1, new_victim = 0,
+  kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
             tid = thread->th.th_info.ds.ds_tid;
 
   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
@@ -2362,13 +2423,13 @@ static inline int __kmp_execute_tasks_template(
         int asleep = 1;
         use_own_tasks = 0;
         // Try to steal from the last place I stole from successfully.
-        if (victim == -2) { // haven't stolen anything yet
-          victim = threads_data[tid].td.td_deque_last_stolen;
-          if (victim !=
+        if (victim_tid == -2) { // haven't stolen anything yet
+          victim_tid = threads_data[tid].td.td_deque_last_stolen;
+          if (victim_tid !=
              -1) // if we have a last stolen from victim, get the thread
-            other_thread = threads_data[victim].td.td_thr;
+            other_thread = threads_data[victim_tid].td.td_thr;
         }
-        if (victim != -1) { // found last victim
+        if (victim_tid != -1) { // found last victim
          asleep = 0;
        } else if (!new_victim) { // no recent steals and we haven't already
                                  // used a new victim; select a random thread
@@ -2376,12 +2437,12 @@ static inline int __kmp_execute_tasks_template(
           // Pick a random thread. Initial plan was to cycle through all the
           // threads, and only return if we tried to steal from every thread,
           // and failed. Arch says that's not such a great idea.
-          victim = __kmp_get_random(thread) % (nthreads - 1);
-          if (victim >= tid) {
-            ++victim; // Adjusts random distribution to exclude self
+          victim_tid = __kmp_get_random(thread) % (nthreads - 1);
+          if (victim_tid >= tid) {
+            ++victim_tid; // Adjusts random distribution to exclude self
           }
           // Found a potential victim
-          other_thread = threads_data[victim].td.td_thr;
+          other_thread = threads_data[victim_tid].td.td_thr;
           // There is a slight chance that __kmp_enable_tasking() did not wake
           // up all threads waiting at the barrier.  If victim is sleeping,
           // then wake it up.  Since we were going to pay the cache miss
@@ -2415,8 +2476,8 @@ static inline int __kmp_execute_tasks_template(
                                           is_constrained);
       }
       if (task != NULL) { // set last stolen to victim
-        if (threads_data[tid].td.td_deque_last_stolen != victim) {
-          threads_data[tid].td.td_deque_last_stolen = victim;
+        if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
+          threads_data[tid].td.td_deque_last_stolen = victim_tid;
           // The pre-refactored code did not try more than 1 successful new
           // victim, unless the last one generated more local tasks;
           // new_victim keeps track of this
@@ -2424,7 +2485,7 @@ static inline int __kmp_execute_tasks_template(
         }
       } else { // No tasks found; unset last_stolen
         KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
-        victim = -2; // no successful victim found
+        victim_tid = -2; // no successful victim found
       }
     }
@@ -3157,6 +3218,7 @@ void __kmp_task_team_wait(
 #else
     KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1);
 #endif
+    KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
     TCW_SYNC_4(task_team->tt.tt_active, FALSE);
     KMP_MB();
-- 
2.7.4
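
A note for readers tracing the TSC logic above: both __kmp_remove_my_task and
__kmp_steal_task now walk a candidate's ancestry up to the current task's
last deferred tied task (td_last_tied) rather than up to the current task
itself. The following is a minimal standalone sketch of that ancestry walk;
the task_t type and its fields are simplified stand-ins for kmp_taskdata_t,
not the runtime's real declarations.

    #include <assert.h>
    #include <stddef.h>

    /* Simplified stand-in for kmp_taskdata_t (illustrative only). */
    typedef struct task {
        struct task *parent;    /* td_parent */
        int level;              /* td_level: nesting depth, parents are shallower */
        struct task *last_tied; /* td_last_tied */
    } task_t;

    /* Returns 1 if 'candidate' may be scheduled under the TSC, i.e. if the
     * current task's last deferred tied task is an ancestor of 'candidate'. */
    static int tsc_allows(const task_t *current_task, const task_t *candidate) {
        task_t *current = current_task->last_tied;
        assert(current != NULL);
        task_t *parent = candidate->parent;
        /* Walk up the ancestry, but only down to 'current's nesting level. */
        while (parent != current && parent->level > current->level) {
            parent = parent->parent;
            assert(parent != NULL);
        }
        return parent == current;
    }

The untied-task relaxation in the patch sits on top of this check: when the
head of the victim's deque fails it but tt_untied_task_encountered is set,
the thief keeps scanning the rest of the deque instead of giving up.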
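The invariant that makes checking only td_last_tied sufficient is maintained
in two places in the patch: at allocation time in __kmp_task_alloc and at
invocation time in __kmp_invoke_task. Below is a compact model of those two
rules under the same assumptions as the sketch above; on_alloc and on_invoke
are illustrative names, not runtime functions.

    #include <assert.h>
    #include <stddef.h>

    enum tiedness { TIED, UNTIED }; /* stands in for TASK_TIED / TASK_UNTIED */

    typedef struct node {
        struct node *last_tied; /* td_last_tied */
        enum tiedness kind;
    } node_t;

    /* __kmp_task_alloc rule: a tied task is its own marker; an untied task's
     * marker is unknown until the task is actually scheduled. */
    static void on_alloc(node_t *t) {
        t->last_tied = (t->kind == TIED) ? t : NULL;
    }

    /* __kmp_invoke_task rule: when an untied task runs, it inherits the marker
     * of the task it is scheduled inside, so the chain of deferred tied tasks
     * is never broken by untied tasks sitting in between. */
    static void on_invoke(node_t *t, const node_t *current) {
        if (t->kind == UNTIED) {
            t->last_tied = current->last_tied;
            assert(t->last_tied != NULL);
        }
    }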
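Finally, the new steal path can remove a task from the middle of the victim's
fixed-size circular deque by shifting the later entries left and pulling the
tail back by one. A self-contained sketch of that operation follows;
DEQUE_SIZE and the deque_t layout are illustrative, not the runtime's
TASK_DEQUE structures.

    /* Removing an element from the middle of a circular deque, as the new
     * steal path does once an untied task lets it scan past the head. */
    #define DEQUE_SIZE 8               /* power of two, so masking works */
    #define DEQUE_MASK (DEQUE_SIZE - 1)

    typedef struct {
        void *slots[DEQUE_SIZE];
        unsigned head;                 /* oldest element */
        unsigned tail;                 /* one past the newest element */
        int ntasks;
    } deque_t;

    /* Remove the element at circular index 'target' (between head and tail,
     * modulo wrap) by shifting the later elements left by one slot. */
    static void *deque_remove_at(deque_t *d, unsigned target) {
        void *stolen = d->slots[target];
        unsigned prev = target;
        for (unsigned next = (target + 1) & DEQUE_MASK; next != d->tail;
             prev = next, next = (next + 1) & DEQUE_MASK) {
            d->slots[prev] = d->slots[next];
        }
        d->tail = prev;                /* tail -= 1, with wraparound */
        d->ntasks--;
        return stolen;
    }

This mirrors the patch's shift loop: it preserves the relative order of the
remaining tasks, so the owner's LIFO end (the tail) and the thieves' FIFO end
(the head) keep their usual meaning after a steal from the middle.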