From 283a215c7a8e185ebdd247ff13d3b79db31070f1 Mon Sep 17 00:00:00 2001
From: Jonathan Peyton
Date: Wed, 2 Mar 2016 22:47:51 +0000
Subject: [PATCH] Add new OpenMP 4.5 taskloop construct feature

From the standard: The taskloop construct specifies that the iterations of one
or more associated loops will be executed in parallel using OpenMP tasks. The
iterations are distributed across tasks created by the construct and scheduled
to be executed.

This initial implementation uses a simple linear tasks distribution algorithm.
Later we can add other algorithms to speed up generation of a huge number of
tasks (e.g., tree-like task generation should be faster).

This needs to be put into the OpenMP runtime library in order for the compiler
team to develop the compiler side of the implementation.

Differential Revision: http://reviews.llvm.org/D17404

llvm-svn: 262535
---
 openmp/runtime/src/dllexports              |   1 +
 openmp/runtime/src/kmp.h                   |  10 +-
 openmp/runtime/src/kmp_tasking.c           | 228 +++++++++++++++++++++++++++++
 openmp/runtime/test/tasking/kmp_taskloop.c | 158 ++++++++++++++++++++
 4 files changed, 391 insertions(+), 6 deletions(-)
 create mode 100644 openmp/runtime/test/tasking/kmp_taskloop.c

diff --git a/openmp/runtime/src/dllexports b/openmp/runtime/src/dllexports
index 6ff5252..14cfaac 100644
--- a/openmp/runtime/src/dllexports
+++ b/openmp/runtime/src/dllexports
@@ -393,6 +393,7 @@ kmpc_set_defaults 224
         __kmpc_doacross_wait                262
         __kmpc_doacross_post                263
         __kmpc_doacross_fini                264
+        __kmpc_taskloop                     266
     %endif
 %endif
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 140bdd8..88179b6 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -2205,11 +2205,7 @@ struct kmp_taskdata { /* aligned during dynamic
 #endif
 #if OMP_41_ENABLED
     kmp_task_team_t *       td_task_team;
-#endif
-#if KMP_HAVE_QUAD
-    _Quad                   td_dummy;       // Align structure 16-byte size since allocated just before kmp_task_t
-#else
-    kmp_uint32              td_dummy[2];
+    kmp_int32               td_size_alloc;  // The size of task structure, including shareds etc.
 #endif
 };  // struct kmp_taskdata
@@ -3478,7 +3474,9 @@ KMP_EXPORT int __kmp_get_cancellation_status(int cancel_kind);
 KMP_EXPORT void __kmpc_proxy_task_completed( kmp_int32 gtid, kmp_task_t *ptask );
 KMP_EXPORT void __kmpc_proxy_task_completed_ooo ( kmp_task_t *ptask );
-
+KMP_EXPORT void __kmpc_taskloop(ident_t *loc, kmp_int32 gtid, kmp_task_t *task, kmp_int32 if_val,
+                kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
+                kmp_int32 nogroup, kmp_int32 sched, kmp_uint64 grainsize, void * task_dup );
 #endif
 #endif
diff --git a/openmp/runtime/src/kmp_tasking.c b/openmp/runtime/src/kmp_tasking.c
index 3ed9a9d..faf1193 100644
--- a/openmp/runtime/src/kmp_tasking.c
+++ b/openmp/runtime/src/kmp_tasking.c
@@ -1000,6 +1000,7 @@ __kmp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_tasking_flags_t *flags,
 #if OMP_41_ENABLED
     taskdata->td_flags.proxy = flags->proxy;
     taskdata->td_task_team = thread->th.th_task_team;
+    taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
 #endif
     taskdata->td_flags.tasktype = TASK_EXPLICIT;
@@ -2877,4 +2878,231 @@ void __kmpc_proxy_task_completed_ooo ( kmp_task_t *ptask )
     KA_TRACE(10, ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n", taskdata ) );
 }
+//---------------------------------------------------------------------------------
+// __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task for taskloop
+//
+// thread:   allocating thread
+// task_src: pointer to source task to be duplicated
+// returns:  a pointer to the allocated kmp_task_t structure (task).
+kmp_task_t *
+__kmp_task_dup_alloc( kmp_info_t *thread, kmp_task_t *task_src )
+{
+    kmp_task_t     *task;
+    kmp_taskdata_t *taskdata;
+    kmp_taskdata_t *taskdata_src;
+    kmp_taskdata_t *parent_task = thread->th.th_current_task;
+    size_t          shareds_offset;
+    size_t          task_size;
+
+    KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread, task_src) );
+    taskdata_src = KMP_TASK_TO_TASKDATA( task_src );
+    KMP_DEBUG_ASSERT( taskdata_src->td_flags.proxy == TASK_FULL ); // it should not be proxy task
+    KMP_DEBUG_ASSERT( taskdata_src->td_flags.tasktype == TASK_EXPLICIT );
+    task_size = taskdata_src->td_size_alloc;
+
+    // Allocate a kmp_taskdata_t block and a kmp_task_t block.
+    KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread, task_size) );
+    #if USE_FAST_MEMORY
+    taskdata = (kmp_taskdata_t *)__kmp_fast_allocate( thread, task_size );
+    #else
+    taskdata = (kmp_taskdata_t *)__kmp_thread_malloc( thread, task_size );
+    #endif /* USE_FAST_MEMORY */
+    KMP_MEMCPY(taskdata, taskdata_src, task_size);
+
+    task = KMP_TASKDATA_TO_TASK(taskdata);
+
+    // Initialize new task (only specific fields not affected by memcpy)
+    taskdata->td_task_id = KMP_GEN_TASK_ID();
+    if( task->shareds != NULL ) { // need setup shareds pointer
+        shareds_offset = (char*)task_src->shareds - (char*)taskdata_src;
+        task->shareds = &((char*)taskdata)[shareds_offset];
+        KMP_DEBUG_ASSERT( (((kmp_uintptr_t)task->shareds) & (sizeof(void*)-1)) == 0 );
+    }
+    taskdata->td_alloc_thread = thread;
+    taskdata->td_taskgroup = parent_task->td_taskgroup; // task inherits the taskgroup from the parent task
+
+    // Only need to keep track of child task counts if team parallel and tasking not serialized
+    if ( !( taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser ) ) {
+        KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_incomplete_child_tasks) );
+        if ( parent_task->td_taskgroup )
+            KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_taskgroup->count) );
+        // Only need to keep track of allocated child tasks for explicit tasks since implicit not deallocated
+        if ( taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT )
+            KMP_TEST_THEN_INC32( (kmp_int32 *)(& taskdata->td_parent->td_allocated_child_tasks) );
+    }
+
+    KA_TRACE(20, ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
+                  thread, taskdata, taskdata->td_parent) );
+#if OMPT_SUPPORT
+    __kmp_task_init_ompt(taskdata, thread->th.th_info.ds.ds_gtid, (void*)task->routine);
+#endif
+    return task;
+}
+
+// Routine optionally generated by the compiler for setting the lastprivate flag
+// and calling needed constructors for private/firstprivate objects
+// (used to form taskloop tasks from pattern task)
+typedef void(*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
+
+//---------------------------------------------------------------------------------
+// __kmp_taskloop_linear: Start tasks of the taskloop linearly
+//
+// loc        Source location information
+// gtid       Global thread ID
+// task       Task with whole loop iteration range
+// lb         Pointer to loop lower bound
+// ub         Pointer to loop upper bound
+// st         Loop stride
+// sched      Schedule specified 0/1/2 for none/grainsize/num_tasks
+// grainsize  Schedule value if specified
+// task_dup   Tasks duplication routine
+void
+__kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
+                      kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
+                      int sched, kmp_uint64 grainsize, void *task_dup )
+{
+    p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
+    kmp_uint64 tc;
+    kmp_uint64 lower = *lb; // compiler provides global bounds here
+    kmp_uint64 upper = *ub;
+    kmp_uint64 i, num_tasks, extras;
+    kmp_info_t *thread = __kmp_threads[gtid];
+    kmp_taskdata_t *current_task = thread->th.th_current_task;
+    kmp_task_t *next_task;
+    kmp_int32 lastpriv = 0;
+    size_t lower_offset = (char*)lb - (char*)task; // remember offset of lb in the task structure
+    size_t upper_offset = (char*)ub - (char*)task; // remember offset of ub in the task structure
+
+    // compute trip count
+    if ( st == 1 ) {   // most common case
+        tc = upper - lower + 1;
+    } else if ( st < 0 ) {
+        tc = (lower - upper) / (-st) + 1;
+    } else {           // st > 0
+        tc = (upper - lower) / st + 1;
+    }
+    if( tc == 0 ) {
+        // free the pattern task and exit
+        __kmp_task_start( gtid, task, current_task );
+        // do not execute anything for zero-trip loop
+        __kmp_task_finish( gtid, task, current_task );
+        return;
+    }
+
+    // compute num_tasks/grainsize based on the input provided
+    switch( sched ) {
+    case 0: // no schedule clause specified, we can choose the default
+        // let's try to schedule (team_size*10) tasks
+        grainsize = thread->th.th_team_nproc * 10;
+    case 2: // num_tasks provided
+        if( grainsize > tc ) {
+            num_tasks = tc;   // too big num_tasks requested, adjust values
+            grainsize = 1;
+            extras = 0;
+        } else {
+            num_tasks = grainsize;
+            grainsize = tc / num_tasks;
+            extras = tc % num_tasks;
+        }
+        break;
+    case 1: // grainsize provided
+        if( grainsize > tc ) {
+            num_tasks = 1;    // too big grainsize requested, adjust values
+            grainsize = tc;
+            extras = 0;
+        } else {
+            num_tasks = tc / grainsize;
+            grainsize = tc / num_tasks; // adjust grainsize for balanced distribution of iterations
+            extras = tc % num_tasks;
+        }
+        break;
+    default:
+        KMP_ASSERT2(0, "unknown scheduling of taskloop");
+    }
+    KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
+    KMP_DEBUG_ASSERT(num_tasks > extras);
+    KMP_DEBUG_ASSERT(num_tasks > 0);
+
+    // Main loop, launch num_tasks tasks, assign grainsize iterations to each task
+    for( i = 0; i < num_tasks; ++i ) {
+        kmp_uint64 chunk_minus_1;
+        if( extras == 0 ) {
+            chunk_minus_1 = grainsize - 1;
+        } else {
+            chunk_minus_1 = grainsize;
+            --extras; // first extras iterations get bigger chunk (grainsize+1)
+        }
+        upper = lower + st * chunk_minus_1;
+        if( i == num_tasks - 1 ) {
+            // schedule the last task, set lastprivate flag
+            lastpriv = 1;
+#if KMP_DEBUG
+            if( st == 1 )
+                KMP_DEBUG_ASSERT(upper == *ub);
+            else if( st > 0 )
+                KMP_DEBUG_ASSERT(upper+st > *ub);
+            else
+                KMP_DEBUG_ASSERT(upper+st < *ub);
+#endif
+        }
+        next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
+        *(kmp_uint64*)((char*)next_task + lower_offset) = lower; // adjust task-specific bounds
+        *(kmp_uint64*)((char*)next_task + upper_offset) = upper;
+        if( ptask_dup != NULL )
+            ptask_dup(next_task, task, lastpriv); // set lastprivate flag, construct firstprivates, etc.
+        __kmp_omp_task(gtid, next_task, true); // schedule new task
+        lower = upper + st; // adjust lower bound for the next iteration
+    }
+    // free the pattern task and exit
+    __kmp_task_start( gtid, task, current_task );
+    // do not execute the pattern task, just do bookkeeping
+    __kmp_task_finish( gtid, task, current_task );
+}
+
+/*!
+@ingroup TASKING
+@param loc       Source location information
+@param gtid      Global thread ID
+@param task      Task structure
+@param if_val    Value of the if clause
+@param lb        Pointer to loop lower bound
+@param ub        Pointer to loop upper bound
+@param st        Loop stride
+@param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
+@param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
+@param grainsize Schedule value if specified
+@param task_dup  Tasks duplication routine
+
+Execute the taskloop construct.
+*/
+void
+__kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
+                kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
+                int nogroup, int sched, kmp_uint64 grainsize, void *task_dup )
+{
+    kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
+    KMP_DEBUG_ASSERT( task != NULL );
+
+    KA_TRACE(10, ("__kmpc_taskloop(enter): T#%d, pattern task %p, lb %lld ub %lld st %lld, grain %llu(%d)\n",
+        gtid, taskdata, *lb, *ub, st, grainsize, sched));
+
+    // check if clause value first
+    if( if_val == 0 ) { // if(0) specified, mark task as serial
+        taskdata->td_flags.task_serial = 1;
+        taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
+    }
+    if( nogroup == 0 ) {
+        __kmpc_taskgroup( loc, gtid );
+    }
+
+    if( 1 /* AC: use some heuristic here to choose task scheduling method */ ) {
+        __kmp_taskloop_linear( loc, gtid, task, lb, ub, st, sched, grainsize, task_dup );
+    }
+
+    if( nogroup == 0 ) {
+        __kmpc_end_taskgroup( loc, gtid );
+    }
+    KA_TRACE(10, ("__kmpc_taskloop(exit): T#%d\n", gtid));
+}
+
 #endif
diff --git a/openmp/runtime/test/tasking/kmp_taskloop.c b/openmp/runtime/test/tasking/kmp_taskloop.c
new file mode 100644
index 0000000..3b318bd
--- /dev/null
+++ b/openmp/runtime/test/tasking/kmp_taskloop.c
@@ -0,0 +1,158 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <omp.h>
+#include "omp_my_sleep.h"
+
+#define N 4
+#define GRAIN 10
+#define STRIDE 3
+
+// globals
+int th_counter[N];
+int counter;
+
+
+// Compiler-generated code (emulation)
+typedef struct ident {
+    void* dummy;
+} ident_t;
+
+typedef struct shar {
+    int(*pth_counter)[N];
+    int *pcounter;
+    int *pj;
+} *pshareds;
+
+typedef struct task {
+    pshareds shareds;
+    int(* routine)(int,struct task*);
+    int part_id;
+// privates:
+    unsigned long long lb; // library always uses ULONG
+    unsigned long long ub;
+    int st;
+    int last;
+    int i;
+    int j;
+    int th;
+} *ptask, kmp_task_t;
+
+typedef int(* task_entry_t)( int, ptask );
+
+void
+__task_dup_entry(ptask task_dst, ptask task_src, int lastpriv)
+{
+// setup lastprivate flag
+    task_dst->last = lastpriv;
+// could be constructor calls here...
+}
+
+
+// OpenMP RTL interfaces
+typedef unsigned long long kmp_uint64;
+typedef long long kmp_int64;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+void
+__kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
+                kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
+                int nogroup, int sched, kmp_int64 grainsize, void *task_dup );
+ptask
+__kmpc_omp_task_alloc( ident_t *loc, int gtid, int flags,
+                       size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+                       task_entry_t task_entry );
+void __kmpc_atomic_fixed4_add(void *id_ref, int gtid, int * lhs, int rhs);
+int __kmpc_global_thread_num(void *id_ref);
+#ifdef __cplusplus
+}
+#endif
+
+
+// User's code
+int task_entry(int gtid, ptask task)
+{
+    pshareds pshar = task->shareds;
+    for( task->i = task->lb; task->i <= (int)task->ub; task->i += task->st ) {
+        task->th = omp_get_thread_num();
+        __kmpc_atomic_fixed4_add(NULL,gtid,pshar->pcounter,1);
+        __kmpc_atomic_fixed4_add(NULL,gtid,&((*pshar->pth_counter)[task->th]),1);
+        task->j = task->i;
+    }
+    my_sleep( 0.1 ); // sleep 100 ms in order to allow other threads to steal tasks
+    if( task->last ) {
+        *(pshar->pj) = task->j; // lastprivate
+    }
+    return 0;
+}
+
+int main()
+{
+    int i, j, gtid = __kmpc_global_thread_num(NULL);
+    ptask task;
+    pshareds psh;
+    omp_set_dynamic(0);
+    counter = 0;
+    for( i=0; i<N; ++i )
+        th_counter[i] = 0;
+    #pragma omp parallel
+    {
+    #pragma omp master
+    {
+    task = __kmpc_omp_task_alloc( NULL, gtid, 1, sizeof(struct task), sizeof(struct shar), &task_entry );
+    psh = task->shareds;
+    psh->pth_counter = &th_counter;
+    psh->pcounter = &counter;
+    psh->pj = &j;
+    task->lb = 0;
+    task->ub = N*GRAIN*STRIDE-2;
+    task->st = STRIDE;
+
+    __kmpc_taskloop(
+        NULL,              // location
+        gtid,              // gtid
+        task,              // task structure
+        1,                 // if clause value
+        &task->lb,         // lower bound
+        &task->ub,         // upper bound
+        STRIDE,            // loop increment
+        0,                 // 1 if nogroup specified
+        2,                 // schedule type: 0-none, 1-grainsize, 2-num_tasks
+        N,                 // schedule value (ignored for type 0)
+        (void*)&__task_dup_entry // tasks duplication routine
+        );
+    } // end master
+    } // end parallel
+// check results
+    if( j != N*GRAIN*STRIDE-STRIDE ) {
+        printf("Error in lastprivate, %d != %d\n",j,N*GRAIN*STRIDE-STRIDE);
+        return 1;
+    }
+    if( counter != N*GRAIN ) {
+        printf("Error, counter %d != %d\n",counter,N*GRAIN);
+        return 1;
+    }
+    for( i=0; i<N; ++i ) {
+        if( th_counter[i] % GRAIN ) {
+            printf("Error, th_counter[%d] = %d\n",i,th_counter[i]);
+            return 1;
+        }
+    }
+    printf("passed\n");
+    return 0;
+}
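
Editor's note on the linear distribution: the following standalone C sketch (not part of the patch; the function name taskloop_chunks and the sample values are illustrative) reproduces the trip-count and num_tasks/grainsize/extras computation used by __kmp_taskloop_linear above and prints the per-task bounds.

#include <stdio.h>

/* Standalone sketch of the chunking math in __kmp_taskloop_linear (illustrative only).
 * sched == 2 models the num_tasks clause, anything else models the grainsize clause;
 * the runtime's "no clause" case falls through to num_tasks = team_size * 10. */
static void taskloop_chunks(unsigned long long lb, unsigned long long ub,
                            long long st, int sched, unsigned long long grainsize)
{
    unsigned long long tc, num_tasks, extras, lower = lb, upper;

    /* trip count, mirroring the three stride cases in the runtime */
    if (st == 1)
        tc = ub - lb + 1;
    else if (st < 0)
        tc = (lb - ub) / (unsigned long long)(-st) + 1;
    else
        tc = (ub - lb) / (unsigned long long)st + 1;

    if (sched == 2) {          /* num_tasks provided */
        if (grainsize > tc) { num_tasks = tc; grainsize = 1; extras = 0; }
        else { num_tasks = grainsize; grainsize = tc / num_tasks; extras = tc % num_tasks; }
    } else {                   /* grainsize provided */
        if (grainsize > tc) { num_tasks = 1; grainsize = tc; extras = 0; }
        else { num_tasks = tc / grainsize; grainsize = tc / num_tasks; extras = tc % num_tasks; }
    }
    /* invariant asserted by the runtime: tc == num_tasks * grainsize + extras */

    for (unsigned long long i = 0; i < num_tasks; ++i) {
        /* the first 'extras' tasks get one extra iteration */
        unsigned long long chunk_minus_1 = (extras == 0) ? grainsize - 1 : grainsize;
        if (extras) --extras;
        upper = lower + (unsigned long long)st * chunk_minus_1;
        printf("task %llu: iterations [%llu, %llu]\n", i, lower, upper);
        lower = upper + (unsigned long long)st;
    }
}

int main(void)
{
    /* 37 iterations with grainsize 10 -> 3 tasks covering 13, 12 and 12 iterations */
    taskloop_chunks(0, 36, 1, 1, 10);
    return 0;
}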
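
The new test drives the runtime entry points by hand, emulating compiler-generated code. For readers unfamiliar with that expansion, the user-level OpenMP code it corresponds to looks roughly like the sketch below (illustrative only; it assumes an OpenMP 4.5 compiler with taskloop support and reuses the test's N/GRAIN/STRIDE constants and counters; the function name user_level_equivalent is ours).

#include <omp.h>

#define N      4
#define GRAIN  10
#define STRIDE 3

int counter;
int th_counter[N];

/* Roughly what kmp_taskloop.c emulates through direct runtime calls:
 * N tasks of GRAIN iterations each, with j lastprivate. */
int user_level_equivalent(void)
{
    int i, j = -1;
    #pragma omp parallel
    #pragma omp master
    {
        #pragma omp taskloop num_tasks(N) lastprivate(j)
        for (i = 0; i < N * GRAIN * STRIDE - 1; i += STRIDE) {
            int th = omp_get_thread_num();
            #pragma omp atomic
            counter++;
            #pragma omp atomic
            th_counter[th]++;
            j = i;
        }
    }
    return j; /* N*GRAIN*STRIDE - STRIDE after the construct completes */
}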
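
Finally, a short worked check of the constants the test verifies at the end (again illustrative, not part of the patch): with N=4, GRAIN=10 and STRIDE=3 the pattern task covers 40 iterations, the num_tasks schedule with value N gives each task exactly GRAIN iterations, and the sequentially last iteration (the lastprivate value of j) is N*GRAIN*STRIDE-STRIDE.

#include <assert.h>

int main(void)
{
    enum { N = 4, GRAIN = 10, STRIDE = 3 };
    unsigned long long lb = 0, ub = N * GRAIN * STRIDE - 2;  /* 118, as set by the test */
    long long st = STRIDE;

    /* trip count for st > 0, as computed by __kmp_taskloop_linear */
    unsigned long long tc = (ub - lb) / st + 1;              /* 40 */
    assert(tc == N * GRAIN);                                 /* matches the counter check */

    /* schedule type 2 (num_tasks) with value N: GRAIN iterations per task */
    unsigned long long num_tasks = N;
    unsigned long long grainsize = tc / num_tasks;           /* 10 */
    unsigned long long extras = tc % num_tasks;              /* 0  */
    assert(grainsize == GRAIN && extras == 0);               /* so th_counter[i] % GRAIN == 0 */

    /* last iteration executed, i.e. the lastprivate value of j */
    unsigned long long last = lb + (tc - 1) * st;            /* 117 */
    assert(last == N * GRAIN * STRIDE - STRIDE);             /* matches the lastprivate check */
    return 0;
}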