These are the actual changes in the runtime to issue OMPT-related functions. All...
author     Andrey Churbanov <Andrey.Churbanov@intel.com>
           Wed, 29 Apr 2015 16:42:24 +0000 (16:42 +0000)
committer  Andrey Churbanov <Andrey.Churbanov@intel.com>
           Wed, 29 Apr 2015 16:42:24 +0000 (16:42 +0000)
llvm-svn: 236122
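
For orientation, a tool consumes the events added here roughly as follows. This is a minimal sketch against the 2015 OMPT draft interface that this commit targets; the lookup mechanism, type names, and the nonzero-return convention are assumptions from that draft, not something this patch itself guarantees. The callback signature mirrors the five-argument ompt_event_parallel_begin invocation added in __kmp_fork_call below.

    #include <stdio.h>
    #include <ompt.h>

    /* matches the (task_id, frame, parallel_id, team_size, unwrapped task)
     * invocation added in __kmp_fork_call (signature assumed from the draft) */
    static void
    on_parallel_begin(ompt_task_id_t parent_task_id,
                      ompt_frame_t *parent_frame,
                      ompt_parallel_id_t parallel_id,
                      uint32_t requested_team_size,
                      void *parallel_function)   /* the "unwrapped" task */
    {
        printf("parallel %llu begins\n", (unsigned long long) parallel_id);
    }

    /* called by the runtime at startup when OMPT support is compiled in */
    int
    ompt_initialize(ompt_function_lookup_t lookup,
                    const char *runtime_version, unsigned int ompt_version)
    {
        ompt_set_callback_t ompt_set_callback =
            (ompt_set_callback_t) lookup("ompt_set_callback");
        ompt_set_callback(ompt_event_parallel_begin,
                          (ompt_callback_t) on_parallel_begin);
        return 1;   /* nonzero: enable tracking and callbacks */
    }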

13 files changed:
openmp/runtime/src/kmp.h
openmp/runtime/src/kmp_atomic.h
openmp/runtime/src/kmp_barrier.cpp
openmp/runtime/src/kmp_csupport.c
openmp/runtime/src/kmp_dispatch.cpp
openmp/runtime/src/kmp_gsupport.c
openmp/runtime/src/kmp_lock.cpp
openmp/runtime/src/kmp_runtime.c
openmp/runtime/src/kmp_sched.cpp
openmp/runtime/src/kmp_tasking.c
openmp/runtime/src/kmp_wait_release.h
openmp/runtime/src/z_Linux_asm.s
openmp/runtime/src/z_Windows_NT-586_asm.asm

diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 2f7aadd..2df7bb0 100644
@@ -3023,11 +3023,17 @@ extern kmp_info_t * __kmp_allocate_thread( kmp_root_t *root,
                                            kmp_team_t *team, int tid);
 #if OMP_40_ENABLED
 extern kmp_team_t * __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
+#if OMPT_SUPPORT
+                                         ompt_parallel_id_t ompt_parallel_id,
+#endif
                                          kmp_proc_bind_t proc_bind,
                                          kmp_internal_control_t *new_icvs,
                                          int argc USE_NESTED_HOT_ARG(kmp_info_t *thr) );
 #else
 extern kmp_team_t * __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
+#if OMPT_SUPPORT
+                                         ompt_parallel_id_t ompt_parallel_id,
+#endif
                                          kmp_internal_control_t *new_icvs,
                                          int argc USE_NESTED_HOT_ARG(kmp_info_t *thr) );
 #endif // OMP_40_ENABLED
@@ -3062,7 +3068,11 @@ enum fork_context_e
     fork_context_last
 };
 extern int __kmp_fork_call( ident_t *loc, int gtid, enum fork_context_e fork_context,
-  kmp_int32 argc, microtask_t microtask, launch_t invoker,
+  kmp_int32 argc,
+#if OMPT_SUPPORT
+  void *unwrapped_task,
+#endif
+  microtask_t microtask, launch_t invoker,
 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
 #if (KMP_ARCH_ARM || KMP_ARCH_X86_64 || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                              va_list *ap
@@ -3172,7 +3182,11 @@ extern void __kmp_clear_x87_fpu_status_word();
 
 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
 
-extern int __kmp_invoke_microtask( microtask_t pkfn, int gtid, int npr, int argc, void *argv[] );
+extern int __kmp_invoke_microtask( microtask_t pkfn, int gtid, int npr, int argc, void *argv[]
+#if OMPT_SUPPORT
+                                   , void **exit_frame_ptr
+#endif
+);
 
 
 /* ------------------------------------------------------------------------ */
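The new trailing exit_frame_ptr parameter is how the assembly dispatch stubs (z_Linux_asm.s and z_Windows_NT-586_asm.asm in the file list) publish the boundary between runtime and user frames: before transferring control to the microtask they store their own frame address through the pointer, so a sampling tool can stop unwinding there. A conceptual C rendering of the stub, as a sketch only (the real implementation is assembly and forwards all argc arguments):

    int
    __kmp_invoke_microtask( microtask_t pkfn, int gtid, int npr,
                            int argc, void *argv[]
    #if OMPT_SUPPORT
                            , void **exit_frame_ptr
    #endif
                            )
    {
    #if OMPT_SUPPORT
        if ( exit_frame_ptr )
            *exit_frame_ptr = __builtin_frame_address(0); /* runtime/user boundary */
    #endif
        (*pkfn)( &gtid, &npr, argv[0] /* , ...remaining argc args... */ );
        return 1;
    }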
diff --git a/openmp/runtime/src/kmp_atomic.h b/openmp/runtime/src/kmp_atomic.h
index 9096551..419ad08 100644
 #include "kmp_os.h"
 #include "kmp_lock.h"
 
+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
+
 // C++ build port.
 // Intel compiler does not support _Complex datatype on win.
 // Intel compiler supports _Complex datatype on lin and mac.
@@ -366,7 +370,23 @@ typedef kmp_queuing_lock_t kmp_atomic_lock_t;
 static inline void
 __kmp_acquire_atomic_lock( kmp_atomic_lock_t *lck, kmp_int32 gtid )
 {
+#if OMPT_SUPPORT && OMPT_TRACE
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_wait_atomic)) {
+        ompt_callbacks.ompt_callback(ompt_event_wait_atomic)(
+            (ompt_wait_id_t) lck);
+    }
+#endif
+
     __kmp_acquire_queuing_lock( lck, gtid );
+
+#if OMPT_SUPPORT && OMPT_TRACE
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_acquired_atomic)) {
+        ompt_callbacks.ompt_callback(ompt_event_acquired_atomic)(
+            (ompt_wait_id_t) lck);
+    }
+#endif
 }
 
 static inline int
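Two guards recur throughout this patch: (ompt_status & ompt_status_track) asks whether any OMPT tracking is on, while (ompt_status == ompt_status_track_callback) additionally requires that callbacks are enabled. This works because the encoding in ompt-internal.h, reproduced here as an assumption, makes the callback state a superset of the tracking bit:

    typedef enum ompt_status_e {
        ompt_status_disabled       = 0x0,
        ompt_status_ready          = 0x1,
        ompt_status_track          = 0x2,
        ompt_status_track_callback = 0x6,   /* 0x4 | ompt_status_track */
    } ompt_status_t;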
@@ -379,6 +399,13 @@ static inline void
 __kmp_release_atomic_lock( kmp_atomic_lock_t *lck, kmp_int32 gtid )
 {
     __kmp_release_queuing_lock( lck, gtid );
+#if OMPT_SUPPORT && OMPT_BLAME
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_release_atomic)) {
+        ompt_callbacks.ompt_callback(ompt_event_release_atomic)(
+            (ompt_wait_id_t) lck);
+    }
+#endif
 }
 
 static inline void
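Together the wait/acquired/release trio lets a tool measure contention on atomics and, under OMPT_BLAME, attribute it to the releasing thread. A tool-side sketch of the timing half, assuming ompt.h and omp_get_wtime() are available (per-thread state; aggregation elided):

    #include <omp.h>

    static __thread double atomic_wait_start;

    static void on_wait_atomic(ompt_wait_id_t wait_id) {
        atomic_wait_start = omp_get_wtime();   /* about to contend on lck */
    }
    static void on_acquired_atomic(ompt_wait_id_t wait_id) {
        double waited = omp_get_wtime() - atomic_wait_start;
        /* accumulate 'waited' against wait_id (the lock address) ... */
        (void) waited;
    }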
diff --git a/openmp/runtime/src/kmp_barrier.cpp b/openmp/runtime/src/kmp_barrier.cpp
index f6e0d0a..72c9fa8 100644
@@ -1034,10 +1034,37 @@ __kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
     register kmp_team_t *team = this_thr->th.th_team;
     register int status = 0;
     ident_t *loc = __kmp_threads[gtid]->th.th_ident;
+#if OMPT_SUPPORT
+    ompt_task_id_t my_task_id;
+    ompt_parallel_id_t my_parallel_id;
+#endif
 
     KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
                   gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
 
+#if OMPT_SUPPORT && OMPT_TRACE
+    if (ompt_status & ompt_status_track) {
+        if (ompt_status == ompt_status_track_callback) {
+            my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
+            my_parallel_id = team->t.ompt_team_info.parallel_id;
+
+            if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
+                if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
+                    ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
+                        my_parallel_id, my_task_id);
+                }
+            }
+            this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
+            if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
+                ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
+                    my_parallel_id, my_task_id);
+            }
+        } else {
+            this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
+        }
+    }
+#endif
+
     if (! team->t.t_serialized) {
 #if USE_ITT_BUILD
         // This value will be used in itt notify events below.
@@ -1195,6 +1222,20 @@ __kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
     }
     KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
                   gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+#if OMPT_TRACE
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
+            ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
+                my_parallel_id, my_task_id);
+        }
+#endif
+        this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+    }
+#endif
+
     return status;
 }
 
@@ -1286,6 +1327,16 @@ __kmp_join_barrier(int gtid)
     KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
     KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));
 
+#if OMPT_SUPPORT && OMPT_TRACE
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
+        ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
+            team->t.ompt_team_info.parallel_id,
+            team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
+    }
+    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
+#endif
+
     if (__kmp_tasking_mode == tskm_extra_barrier) {
         __kmp_tasking_barrier(team, this_thr, gtid);
         KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, team_id, tid));
@@ -1401,6 +1452,22 @@ __kmp_join_barrier(int gtid)
     // TODO now, mark worker threads as done so they may be disbanded
     KMP_MB(); // Flush all pending memory write invalidates.
     KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+#if OMPT_TRACE
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
+            ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
+                team->t.ompt_team_info.parallel_id,
+                team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
+        }
+#endif
+
+        // return to default state
+        this_thr->th.ompt_thread_info.state = ompt_state_overhead;
+    }
+#endif
 }
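
Both barrier paths now bracket the wait with ompt_event_barrier_begin/ompt_event_barrier_end and park the thread state in ompt_state_wait_barrier, so a tool can charge barrier time per thread. A hedged tool-side sketch (thread-local timer; aggregation elided):

    #include <omp.h>

    static __thread double barrier_enter;

    static void on_barrier_begin(ompt_parallel_id_t parallel_id,
                                 ompt_task_id_t task_id) {
        barrier_enter = omp_get_wtime();
    }
    static void on_barrier_end(ompt_parallel_id_t parallel_id,
                               ompt_task_id_t task_id) {
        double dt = omp_get_wtime() - barrier_enter;
        /* charge dt to (parallel_id, task_id) ... */
        (void) dt;
    }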
 
 
diff --git a/openmp/runtime/src/kmp_csupport.c b/openmp/runtime/src/kmp_csupport.c
index 3d368d5..f5451e9 100644
 #include "kmp_error.h"
 #include "kmp_stats.h"
 
+#if OMPT_SUPPORT
+#include "ompt-internal.h"
+#include "ompt-specific.h"
+#endif
+
 #define MAX_MESSAGE 512
 
 /* ------------------------------------------------------------------------ */
@@ -283,12 +288,23 @@ __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
     va_list     ap;
     va_start(   ap, microtask );
 
+#if OMPT_SUPPORT
+    kmp_info_t *master_th = __kmp_threads[ gtid ];
+    kmp_team_t *parent_team = master_th->th.th_team;
+    int tid = __kmp_tid_from_gtid( gtid );
+    parent_team->t.t_implicit_task_taskdata[tid].
+        ompt_task_info.frame.reenter_runtime_frame = __builtin_frame_address(0);
+#endif
+
 #if INCLUDE_SSC_MARKS
     SSC_MARK_FORKING();
 #endif
     __kmp_fork_call( loc, gtid, fork_context_intel,
             argc,
-            VOLATILE_CAST(microtask_t) microtask,
+#if OMPT_SUPPORT
+            VOLATILE_CAST(void *) microtask,      // "unwrapped" task
+#endif
+            VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
             VOLATILE_CAST(launch_t)    __kmp_invoke_task_func,
 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
@@ -303,6 +319,13 @@ __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
     __kmp_join_call( loc, gtid );
 
     va_end( ap );
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        parent_team->t.t_implicit_task_taskdata[tid].
+            ompt_task_info.frame.reenter_runtime_frame = 0;
+    }
+#endif
   }
   KMP_START_EXPLICIT_TIMER(OMP_serial);
 }
@@ -358,7 +381,10 @@ __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
 
     __kmp_fork_call( loc, gtid, fork_context_intel,
             argc,
-            VOLATILE_CAST(microtask_t) __kmp_teams_master,
+#if OMPT_SUPPORT
+            VOLATILE_CAST(void *) microtask,               // "unwrapped" task
+#endif
+            VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
             VOLATILE_CAST(launch_t)    __kmp_invoke_teams_master,
 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
             &ap
@@ -662,6 +688,20 @@ __kmpc_master(ident_t *loc, kmp_int32 global_tid)
     if( KMP_MASTER_GTID( global_tid ))
         status = 1;
 
+#if OMPT_SUPPORT && OMPT_TRACE
+    if (status) {
+        kmp_info_t  *this_thr        = __kmp_threads[ global_tid ];
+        kmp_team_t  *team            = this_thr -> th.th_team;
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_master_begin)) {
+            int  tid = __kmp_tid_from_gtid( global_tid );
+            ompt_callbacks.ompt_callback(ompt_event_master_begin)(
+                team->t.ompt_team_info.parallel_id,
+                team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
+        }
+    }
+#endif
+
     if ( __kmp_env_consistency_check ) {
 #if KMP_USE_DYNAMIC_LOCK
         if (status)
@@ -694,6 +734,18 @@ __kmpc_end_master(ident_t *loc, kmp_int32 global_tid)
 
     KMP_DEBUG_ASSERT( KMP_MASTER_GTID( global_tid ));
 
+#if OMPT_SUPPORT && OMPT_TRACE
+    kmp_info_t  *this_thr        = __kmp_threads[ global_tid ];
+    kmp_team_t  *team            = this_thr -> th.th_team;
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_master_end)) {
+        int  tid = __kmp_tid_from_gtid( global_tid );
+        ompt_callbacks.ompt_callback(ompt_event_master_end)(
+            team->t.ompt_team_info.parallel_id,
+            team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
+    }
+#endif
+
     if ( __kmp_env_consistency_check ) {
         if( global_tid < 0 )
             KMP_WARNING( ThreadIdentInvalid );
@@ -729,11 +781,41 @@ __kmpc_ordered( ident_t * loc, kmp_int32 gtid )
 
     th = __kmp_threads[ gtid ];
 
+#if OMPT_SUPPORT && OMPT_TRACE
+    if (ompt_status & ompt_status_track) {
+        /* OMPT state update */
+        th->th.ompt_thread_info.wait_id = (uint64_t) loc;
+        th->th.ompt_thread_info.state = ompt_state_wait_ordered;
+
+        /* OMPT event callback */
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_wait_ordered)) {
+            ompt_callbacks.ompt_callback(ompt_event_wait_ordered)(
+                th->th.ompt_thread_info.wait_id);
+        }
+    }
+#endif
+
     if ( th -> th.th_dispatch -> th_deo_fcn != 0 )
         (*th->th.th_dispatch->th_deo_fcn)( & gtid, & cid, loc );
     else
         __kmp_parallel_deo( & gtid, & cid, loc );
 
+#if OMPT_SUPPORT && OMPT_TRACE
+    if (ompt_status & ompt_status_track) {
+        /* OMPT state update */
+        th->th.ompt_thread_info.state = ompt_state_work_parallel;
+        th->th.ompt_thread_info.wait_id = 0;
+
+        /* OMPT event callback */
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)) {
+            ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)(
+                th->th.ompt_thread_info.wait_id);
+        }
+    }
+#endif
+
 #if USE_ITT_BUILD
     __kmp_itt_ordered_start( gtid );
 #endif /* USE_ITT_BUILD */
@@ -765,6 +847,14 @@ __kmpc_end_ordered( ident_t * loc, kmp_int32 gtid )
         (*th->th.th_dispatch->th_dxo_fcn)( & gtid, & cid, loc );
     else
         __kmp_parallel_dxo( & gtid, & cid, loc );
+
+#if OMPT_SUPPORT && OMPT_BLAME
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
+        ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
+            th->th.ompt_thread_info.wait_id);
+    }
+#endif
 }
 
 #if KMP_USE_DYNAMIC_LOCK
@@ -1137,6 +1227,14 @@ __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid, kmp_critical_name *crit)
     // Value of 'crit' should be good for using as a critical_id of the critical section directive.
     __kmp_release_user_lock_with_checks( lck, global_tid );
 
+#if OMPT_SUPPORT && OMPT_BLAME
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_release_critical)) {
+        ompt_callbacks.ompt_callback(ompt_event_release_critical)(
+            (uint64_t) lck);
+    }
+#endif
+
 #endif // KMP_USE_DYNAMIC_LOCK
 
     KA_TRACE( 15, ("__kmpc_end_critical: done T#%d\n", global_tid ));
@@ -1257,6 +1355,31 @@ __kmpc_single(ident_t *loc, kmp_int32 global_tid)
 {
     KMP_COUNT_BLOCK(OMP_SINGLE);
     kmp_int32 rc = __kmp_enter_single( global_tid, loc, TRUE );
+
+#if OMPT_SUPPORT && OMPT_TRACE
+    kmp_info_t *this_thr        = __kmp_threads[ global_tid ];
+    kmp_team_t *team            = this_thr -> th.th_team;
+    int tid = __kmp_tid_from_gtid( global_tid );
+
+    if (ompt_status == ompt_status_track_callback) {
+        if (rc) {
+            if (ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)) {
+                ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)(
+                    team->t.ompt_team_info.parallel_id,
+                    team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id,
+                    team->t.ompt_team_info.microtask);
+            }
+        } else {
+            if (ompt_callbacks.ompt_callback(ompt_event_single_others_begin)) {
+                ompt_callbacks.ompt_callback(ompt_event_single_others_begin)(
+                    team->t.ompt_team_info.parallel_id,
+                    team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
+            }
+            this_thr->th.ompt_thread_info.state = ompt_state_wait_single;
+        }
+    }
+#endif
+
     return rc;
 }
 
@@ -1273,6 +1396,19 @@ void
 __kmpc_end_single(ident_t *loc, kmp_int32 global_tid)
 {
     __kmp_exit_single( global_tid );
+
+#if OMPT_SUPPORT && OMPT_TRACE
+    kmp_info_t *this_thr        = __kmp_threads[ global_tid ];
+    kmp_team_t *team            = this_thr -> th.th_team;
+    int tid = __kmp_tid_from_gtid( global_tid );
+
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)) {
+        ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)(
+            team->t.ompt_team_info.parallel_id,
+            team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
+    }
+#endif
 }
 
 /*!
@@ -1287,6 +1423,19 @@ __kmpc_for_static_fini( ident_t *loc, kmp_int32 global_tid )
 {
     KE_TRACE( 10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
 
+#if OMPT_SUPPORT && OMPT_TRACE
+    kmp_info_t *this_thr        = __kmp_threads[ global_tid ];
+    kmp_team_t *team            = this_thr -> th.th_team;
+    int tid = __kmp_tid_from_gtid( global_tid );
+
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_loop_end)) {
+        ompt_callbacks.ompt_callback(ompt_event_loop_end)(
+            team->t.ompt_team_info.parallel_id,
+            team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
+    }
+#endif
+
     if ( __kmp_env_consistency_check )
      __kmp_pop_workshare( global_tid, ct_pdo, loc );
 }
@@ -1928,6 +2077,13 @@ __kmpc_unset_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
 
     RELEASE_LOCK( lck, gtid );
 
+#if OMPT_SUPPORT && OMPT_BLAME
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_release_lock)) {
+        ompt_callbacks.ompt_callback(ompt_event_release_lock)((uint64_t) lck);
+    }
+#endif
+
 #endif // KMP_USE_DYNAMIC_LOCK
 }
 
@@ -1980,7 +2136,20 @@ __kmpc_unset_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
     __kmp_itt_lock_releasing( lck );
 #endif /* USE_ITT_BUILD */
 
-    RELEASE_NESTED_LOCK( lck, gtid );
+    int release_status = RELEASE_NESTED_LOCK( lck, gtid );
+#if OMPT_SUPPORT && OMPT_BLAME
+    if (ompt_status == ompt_status_track_callback) {
+        if (release_status == KMP_LOCK_RELEASED) {
+            if (ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)) {
+                ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)(
+                    (uint64_t) lck);
+            }
+        } else if (ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_prev)) {
+            ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_prev)(
+                (uint64_t) lck);
+        }
+    }
+#endif
 
 #endif // KMP_USE_DYNAMIC_LOCK
 }
diff --git a/openmp/runtime/src/kmp_dispatch.cpp b/openmp/runtime/src/kmp_dispatch.cpp
index a39c8f2..2cf3d7f 100644
     #include <float.h>
 #endif
 
+#if OMPT_SUPPORT
+#include "ompt-internal.h"
+#include "ompt-specific.h"
+#endif
+
 /* ------------------------------------------------------------------------ */
 /* ------------------------------------------------------------------------ */
 
@@ -1189,6 +1194,16 @@ __kmp_dispatch_init(
       }
     }
     #endif // ( KMP_STATIC_STEAL_ENABLED && USE_STEALING )
+
+#if OMPT_SUPPORT && OMPT_TRACE
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
+        ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
+        ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
+        ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
+            team_info->parallel_id, task_info->task_id, team_info->microtask);
+    }
+#endif
 }
 
 /*
@@ -1339,6 +1354,24 @@ __kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
 
 #endif /* KMP_GOMP_COMPAT */
 
+/* Define a macro for exiting __kmp_dispatch_next(). If status is 0
+ * (no more work), then tell OMPT the loop is over. In some cases
+ * kmp_dispatch_fini() is not called. */
+#if OMPT_SUPPORT && OMPT_TRACE
+#define OMPT_LOOP_END                                                          \
+    if (status == 0) {                                                         \
+        if ((ompt_status == ompt_status_track_callback) &&                     \
+            ompt_callbacks.ompt_callback(ompt_event_loop_end)) {               \
+            ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);        \
+            ompt_task_info_t *task_info = __ompt_get_taskinfo(0);              \
+            ompt_callbacks.ompt_callback(ompt_event_loop_end)(                 \
+                team_info->parallel_id, task_info->task_id);                   \
+        }                                                                      \
+    }
+#else
+#define OMPT_LOOP_END // no-op
+#endif
+
 template< typename T >
 static int
 __kmp_dispatch_next(
@@ -1476,6 +1509,7 @@ __kmp_dispatch_next(
 #if INCLUDE_SSC_MARKS
         SSC_MARK_DISPATCH_NEXT();
 #endif
+        OMPT_LOOP_END;
         return status;
     } else {
         kmp_int32 last = 0;
@@ -2115,6 +2149,7 @@ __kmp_dispatch_next(
 #if INCLUDE_SSC_MARKS
     SSC_MARK_DISPATCH_NEXT();
 #endif
+    OMPT_LOOP_END;
     return status;
 }
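
With loop_begin issued from __kmp_dispatch_init and OMPT_LOOP_END firing at every status == 0 return site, a tool observes exactly one balanced begin/end pair per worksharing loop per thread, so nesting can be tracked with a simple counter. Sketch (callback signatures follow the calls above):

    static __thread int live_loops;

    static void on_loop_begin(ompt_parallel_id_t parallel_id,
                              ompt_task_id_t task_id,
                              void *workshare_function) {
        live_loops++;
    }
    static void on_loop_end(ompt_parallel_id_t parallel_id,
                            ompt_task_id_t task_id) {
        live_loops--;
    }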
 
diff --git a/openmp/runtime/src/kmp_gsupport.c b/openmp/runtime/src/kmp_gsupport.c
index 09abb3e..99dc822 100644
 #include "kmp.h"
 #include "kmp_atomic.h"
 
+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
+
 #ifdef __cplusplus
     extern "C" {
 #endif // __cplusplus
@@ -106,6 +110,11 @@ xexpand(KMP_API_NAME_GOMP_ATOMIC_START)(void)
 {
     int gtid = __kmp_entry_gtid();
     KA_TRACE(20, ("GOMP_atomic_start: T#%d\n", gtid));
+
+#if OMPT_SUPPORT
+    __ompt_thread_assign_wait_id(0);
+#endif
+
     __kmp_acquire_atomic_lock(&__kmp_atomic_lock, gtid);
 }
 
@@ -246,7 +255,36 @@ void
 __kmp_GOMP_microtask_wrapper(int *gtid, int *npr, void (*task)(void *),
   void *data)
 {
+#if OMPT_SUPPORT
+    kmp_info_t *thr;
+    ompt_frame_t *ompt_frame;
+    ompt_state_t enclosing_state;
+
+    if (ompt_status & ompt_status_track) {
+        // get pointer to thread data structure
+        thr = __kmp_threads[*gtid];
+
+        // save enclosing task state; set current state for task
+        enclosing_state = thr->th.ompt_thread_info.state;
+        thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+
+        // set task frame
+        ompt_frame = __ompt_get_task_frame_internal(0);
+        ompt_frame->exit_runtime_frame = __builtin_frame_address(0);
+    }
+#endif
+
     task(data);
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        // clear task frame
+        ompt_frame->exit_runtime_frame = NULL;
+
+        // restore enclosing state
+        thr->th.ompt_thread_info.state = enclosing_state;
+    }
+#endif
 }
 
 
@@ -264,10 +302,37 @@ __kmp_GOMP_parallel_microtask_wrapper(int *gtid, int *npr,
     KMP_DISPATCH_INIT(loc, *gtid, schedule, start, end, incr, chunk_size,
       schedule != kmp_sch_static);
 
+#if OMPT_SUPPORT
+    kmp_info_t *thr;
+    ompt_frame_t *ompt_frame;
+    ompt_state_t enclosing_state;
+
+    if (ompt_status & ompt_status_track) {
+        thr = __kmp_threads[*gtid];
+        // save enclosing task state; set current state for task
+        enclosing_state = thr->th.ompt_thread_info.state;
+        thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+
+        // set task frame
+        ompt_frame = __ompt_get_task_frame_internal(0);
+        ompt_frame->exit_runtime_frame = __builtin_frame_address(0);
+    }
+#endif
+
     //
     // Now invoke the microtask.
     //
     task(data);
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        // clear task frame
+        ompt_frame->exit_runtime_frame = NULL;
+
+        // reset enclosing state
+        thr->th.ompt_thread_info.state = enclosing_state;
+    }
+#endif
 }
 
 
@@ -275,14 +340,26 @@ __kmp_GOMP_parallel_microtask_wrapper(int *gtid, int *npr,
 static
 #endif /* KMP_DEBUG */
 void
-__kmp_GOMP_fork_call(ident_t *loc, int gtid, microtask_t wrapper, int argc,...)
+__kmp_GOMP_fork_call(ident_t *loc, int gtid, void (*unwrapped_task)(void *), microtask_t wrapper, int argc,...)
 {
     int rc;
+    kmp_info_t *thr = __kmp_threads[gtid];
+    kmp_team_t *team = thr->th.th_team;
+    int tid = __kmp_tid_from_gtid(gtid);
 
     va_list ap;
     va_start(ap, argc);
 
-    rc = __kmp_fork_call(loc, gtid, fork_context_gnu, argc, wrapper, __kmp_invoke_task_func,
+#if OMPT_SUPPORT
+    team->t.t_implicit_task_taskdata[tid].
+        ompt_task_info.frame.reenter_runtime_frame = NULL;
+#endif
+
+    rc = __kmp_fork_call(loc, gtid, fork_context_gnu, argc,
+#if OMPT_SUPPORT
+      VOLATILE_CAST(void *) unwrapped_task,
+#endif
+      wrapper, __kmp_invoke_task_func,
 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
       &ap
 #else
@@ -293,10 +370,69 @@ __kmp_GOMP_fork_call(ident_t *loc, int gtid, microtask_t wrapper, int argc,...)
     va_end(ap);
 
     if (rc) {
+        __kmp_run_before_invoked_task(gtid, tid, thr, team);
+    }
+
+#if OMPT_SUPPORT && OMPT_TRACE
+    if (ompt_status & ompt_status_track) {
+        ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
+        ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
+
+        // implicit task callback
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
+            ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
+                team_info->parallel_id, task_info->task_id);
+        }
+        thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+    }
+#endif
+}
+
+static void
+__kmp_GOMP_serialized_parallel(ident_t *loc, kmp_int32 gtid, void (*task)(void *))
+{
+    __kmp_serialized_parallel(loc, gtid);
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        ompt_task_id_t ompt_task_id = __ompt_get_task_id_internal(0);
+        ompt_frame_t  *ompt_frame = __ompt_get_task_frame_internal(0);
         kmp_info_t *thr = __kmp_threads[gtid];
-        __kmp_run_before_invoked_task(gtid, __kmp_tid_from_gtid(gtid), thr,
-          thr->th.th_team);
+
+        ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(gtid);
+        ompt_task_id_t my_ompt_task_id = __ompt_task_id_new(gtid);
+
+        ompt_frame->exit_runtime_frame = NULL;
+
+        // parallel region callback
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_parallel_begin)) {
+            int team_size = 1;
+            ompt_callbacks.ompt_callback(ompt_event_parallel_begin)(
+                ompt_task_id, ompt_frame, ompt_parallel_id,
+                team_size, (void *) task);
+        }
+
+        // set up lightweight task
+        ompt_lw_taskteam_t *lwt = (ompt_lw_taskteam_t *)
+            __kmp_allocate(sizeof(ompt_lw_taskteam_t));
+        __ompt_lw_taskteam_init(lwt, thr, gtid, (void *) task, ompt_parallel_id);
+        lwt->ompt_task_info.task_id = my_ompt_task_id;
+        lwt->ompt_task_info.frame.exit_runtime_frame = 0;
+        __ompt_lw_taskteam_link(lwt, thr);
+
+#if OMPT_TRACE
+        // implicit task callback
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
+            ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
+                ompt_parallel_id, my_ompt_task_id);
+        }
+        thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+#endif
     }
+#endif
 }
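
A serialized region gets no kmp_team_t of its own, so the bookkeeping above lives in a heap-allocated lightweight task team that is linked onto the thread here and unlinked (and freed) in GOMP_parallel_end. The record holds just the OMPT view of a team, roughly as follows (layout assumed from ompt-internal.h; field names as used in this patch):

    typedef struct ompt_lw_taskteam_s {
        ompt_team_info_t           ompt_team_info;   /* parallel_id, microtask */
        ompt_task_info_t           ompt_task_info;   /* task_id, frame */
        struct ompt_lw_taskteam_s *parent;           /* enclosing region, if nested */
    } ompt_lw_taskteam_t;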
 
 
@@ -304,6 +440,16 @@ void
 xexpand(KMP_API_NAME_GOMP_PARALLEL_START)(void (*task)(void *), void *data, unsigned num_threads)
 {
     int gtid = __kmp_entry_gtid();
+
+#if OMPT_SUPPORT
+    ompt_frame_t *parent_frame;
+
+    if (ompt_status & ompt_status_track) {
+        parent_frame = __ompt_get_task_frame_internal(0);
+        parent_frame->reenter_runtime_frame = __builtin_frame_address(0);
+    }
+#endif
+
     MKLOC(loc, "GOMP_parallel_start");
     KA_TRACE(20, ("GOMP_parallel_start: T#%d\n", gtid));
 
@@ -311,12 +457,18 @@ xexpand(KMP_API_NAME_GOMP_PARALLEL_START)(void (*task)(void *), void *data, unsi
         if (num_threads != 0) {
             __kmp_push_num_threads(&loc, gtid, num_threads);
         }
-        __kmp_GOMP_fork_call(&loc, gtid,
+        __kmp_GOMP_fork_call(&loc, gtid, task,
           (microtask_t)__kmp_GOMP_microtask_wrapper, 2, task, data);
     }
     else {
-        __kmpc_serialized_parallel(&loc, gtid);
+        __kmp_GOMP_serialized_parallel(&loc, gtid, task);
+    }
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        parent_frame->reenter_runtime_frame = NULL;
     }
+#endif
 }
 
 
@@ -324,9 +476,39 @@ void
 xexpand(KMP_API_NAME_GOMP_PARALLEL_END)(void)
 {
     int gtid = __kmp_get_gtid();
+    kmp_info_t *thr = __kmp_threads[gtid];
+
     MKLOC(loc, "GOMP_parallel_end");
     KA_TRACE(20, ("GOMP_parallel_end: T#%d\n", gtid));
 
+
+#if OMPT_SUPPORT
+    ompt_parallel_id_t parallel_id;
+    ompt_frame_t *ompt_frame = NULL;
+
+    if (ompt_status & ompt_status_track) {
+        ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
+        parallel_id = team_info->parallel_id;
+
+        ompt_frame = __ompt_get_task_frame_internal(0);
+        ompt_frame->exit_runtime_frame = __builtin_frame_address(0);
+
+#if OMPT_TRACE
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
+            ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
+            ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
+                parallel_id, task_info->task_id);
+        }
+#endif
+
+        // unlink if necessary. no-op if there is not a lightweight task.
+        ompt_lw_taskteam_t *lwt = __ompt_lw_taskteam_unlink(thr);
+        // GOMP allocates/frees lwt since it can't be kept on the stack
+        if (lwt) __kmp_free(lwt);
+    }
+#endif
+
     if (! __kmp_threads[gtid]->th.th_team->t.t_serialized) {
         kmp_info_t *thr = __kmp_threads[gtid];
         __kmp_run_after_invoked_task(gtid, __kmp_tid_from_gtid(gtid), thr,
@@ -335,6 +517,22 @@ xexpand(KMP_API_NAME_GOMP_PARALLEL_END)(void)
     }
     else {
         __kmpc_end_serialized_parallel(&loc, gtid);
+
+#if OMPT_SUPPORT
+        if (ompt_status & ompt_status_track) {
+            if ((ompt_status == ompt_status_track_callback) &&
+                ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
+                ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
+                ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
+                    parallel_id, task_info->task_id);
+            }
+
+            thr->th.ompt_thread_info.state =
+                (((thr->th.th_team)->t.t_serialized) ?
+                ompt_state_work_serial : ompt_state_work_parallel);
+        }
+#endif
+
     }
 }
 
@@ -635,13 +833,13 @@ LOOP_NEXT_ULL(xexpand(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_RUNTIME_NEXT), \
             if (num_threads != 0) {                                          \
                 __kmp_push_num_threads(&loc, gtid, num_threads);             \
             }                                                                \
-            __kmp_GOMP_fork_call(&loc, gtid,                                 \
+            __kmp_GOMP_fork_call(&loc, gtid, task,                           \
               (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, 9,         \
               task, data, num_threads, &loc, (schedule), lb,                 \
               (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz);               \
         }                                                                    \
         else {                                                               \
-            __kmpc_serialized_parallel(&loc, gtid);                          \
+            __kmp_GOMP_serialized_parallel(&loc, gtid, task);                \
         }                                                                    \
                                                                              \
         KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb,                        \
@@ -707,9 +905,32 @@ xexpand(KMP_API_NAME_GOMP_TASK)(void (*func)(void *), void *data, void (*copy_fu
         __kmpc_omp_task(&loc, gtid, task);
     }
     else {
+#if OMPT_SUPPORT
+        ompt_thread_info_t oldInfo;
+        kmp_info_t *thread;
+        kmp_taskdata_t *taskdata;
+        if (ompt_status & ompt_status_track) {
+            // Store the thread's state and restore it after the task
+            thread = __kmp_threads[ gtid ];
+            taskdata = KMP_TASK_TO_TASKDATA(task);
+            oldInfo = thread->th.ompt_thread_info;
+            thread->th.ompt_thread_info.wait_id = 0;
+            thread->th.ompt_thread_info.state = ompt_state_work_parallel;
+            taskdata->ompt_task_info.frame.exit_runtime_frame =
+                __builtin_frame_address(0);
+        }
+#endif
+
         __kmpc_omp_task_begin_if0(&loc, gtid, task);
         func(data);
         __kmpc_omp_task_complete_if0(&loc, gtid, task);
+
+#if OMPT_SUPPORT
+        if (ompt_status & ompt_status_track) {
+            thread->th.ompt_thread_info = oldInfo;
+            taskdata->ompt_task_info.frame.exit_runtime_frame = 0;
+        }
+#endif
     }
 
     KA_TRACE(20, ("GOMP_task exit: T#%d\n", gtid));
@@ -801,6 +1022,16 @@ xexpand(KMP_API_NAME_GOMP_PARALLEL_SECTIONS_START)(void (*task) (void *), void *
 {
     int gtid = __kmp_entry_gtid();
     int last = FALSE;
+
+#if OMPT_SUPPORT
+    ompt_frame_t *parent_frame;
+
+    if (ompt_status & ompt_status_track) {
+        parent_frame = __ompt_get_task_frame_internal(0);
+        parent_frame->reenter_runtime_frame = __builtin_frame_address(0);
+    }
+#endif
+
     MKLOC(loc, "GOMP_parallel_sections_start");
     KA_TRACE(20, ("GOMP_parallel_sections_start: T#%d\n", gtid));
 
@@ -808,15 +1039,21 @@ xexpand(KMP_API_NAME_GOMP_PARALLEL_SECTIONS_START)(void (*task) (void *), void *
         if (num_threads != 0) {
             __kmp_push_num_threads(&loc, gtid, num_threads);
         }
-        __kmp_GOMP_fork_call(&loc, gtid,
+        __kmp_GOMP_fork_call(&loc, gtid, task,
           (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, 9, task, data,
           num_threads, &loc, kmp_nm_dynamic_chunked, (kmp_int)1,
           (kmp_int)count, (kmp_int)1, (kmp_int)1);
     }
     else {
-        __kmpc_serialized_parallel(&loc, gtid);
+        __kmp_GOMP_serialized_parallel(&loc, gtid, task);
     }
 
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        parent_frame->reenter_runtime_frame = NULL;
+    }
+#endif
+
     KMP_DISPATCH_INIT(&loc, gtid, kmp_nm_dynamic_chunked, 1, count, 1, 1, TRUE);
 
     KA_TRACE(20, ("GOMP_parallel_sections_start exit: T#%d\n", gtid));
@@ -865,11 +1102,11 @@ xexpand(KMP_API_NAME_GOMP_PARALLEL)(void (*task)(void *), void *data, unsigned n
         if(flags != 0) {
             __kmp_push_proc_bind(&loc, gtid, (kmp_proc_bind_t)flags);
         }
-        __kmp_GOMP_fork_call(&loc, gtid,
+        __kmp_GOMP_fork_call(&loc, gtid, task,
           (microtask_t)__kmp_GOMP_microtask_wrapper, 2, task, data);
     }
     else {
-        __kmpc_serialized_parallel(&loc, gtid);
+        __kmp_GOMP_serialized_parallel(&loc, gtid, task);
     }
     task(data);
     xexpand(KMP_API_NAME_GOMP_PARALLEL_END)();
@@ -891,13 +1128,13 @@ xexpand(KMP_API_NAME_GOMP_PARALLEL_SECTIONS)(void (*task) (void *), void *data,
         if(flags != 0) {
             __kmp_push_proc_bind(&loc, gtid, (kmp_proc_bind_t)flags);
         }
-        __kmp_GOMP_fork_call(&loc, gtid,
+        __kmp_GOMP_fork_call(&loc, gtid, task,
           (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, 9, task, data,
           num_threads, &loc, kmp_nm_dynamic_chunked, (kmp_int)1,
           (kmp_int)count, (kmp_int)1, (kmp_int)1);
     }
     else {
-        __kmpc_serialized_parallel(&loc, gtid);
+        __kmp_GOMP_serialized_parallel(&loc, gtid, task);
     }
 
     KMP_DISPATCH_INIT(&loc, gtid, kmp_nm_dynamic_chunked, 1, count, 1, 1, TRUE);
@@ -924,13 +1161,13 @@ xexpand(KMP_API_NAME_GOMP_PARALLEL_SECTIONS)(void (*task) (void *), void *data,
             if (flags != 0) {                                                \
                 __kmp_push_proc_bind(&loc, gtid, (kmp_proc_bind_t)flags);    \
             }                                                                \
-            __kmp_GOMP_fork_call(&loc, gtid,                                 \
+            __kmp_GOMP_fork_call(&loc, gtid, task,                           \
               (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, 9,         \
               task, data, num_threads, &loc, (schedule), lb,                 \
               (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz);               \
         }                                                                    \
         else {                                                               \
-            __kmpc_serialized_parallel(&loc, gtid);                          \
+            __kmp_GOMP_serialized_parallel(&loc, gtid, task);                \
         }                                                                    \
                                                                              \
         KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb,                        \
diff --git a/openmp/runtime/src/kmp_lock.cpp b/openmp/runtime/src/kmp_lock.cpp
index 6e7c59c..d33ce69 100644
@@ -1206,6 +1206,10 @@ __kmp_acquire_queuing_lock_timed_template( kmp_queuing_lock_t *lck,
     volatile kmp_uint32 *spin_here_p;
     kmp_int32 need_mf = 1;
 
+#if OMPT_SUPPORT
+    ompt_state_t prev_state = ompt_state_undefined;
+#endif
+
     KA_TRACE( 1000, ("__kmp_acquire_queuing_lock: lck:%p, T#%d entering\n", lck, gtid ));
 
     KMP_FSYNC_PREPARE( lck );
@@ -1309,6 +1313,16 @@ __kmp_acquire_queuing_lock_timed_template( kmp_queuing_lock_t *lck,
 #ifdef DEBUG_QUEUING_LOCKS
                     TRACE_LOCK_HT( gtid+1, "acq exit: ", head, 0 );
 #endif
+
+#if OMPT_SUPPORT
+                    if ((ompt_status & ompt_status_track) &&
+                        prev_state != ompt_state_undefined) {
+                        /* change the state before clearing wait_id */
+                        this_thr->th.ompt_thread_info.state = prev_state;
+                        this_thr->th.ompt_thread_info.wait_id = 0;
+                    }
+#endif
+
                     KMP_FSYNC_ACQUIRED( lck );
                     return; /* lock holder cannot be on queue */
                 }
@@ -1317,6 +1331,16 @@ __kmp_acquire_queuing_lock_timed_template( kmp_queuing_lock_t *lck,
             break;
         }
 
+#if OMPT_SUPPORT
+        if ((ompt_status & ompt_status_track) &&
+            prev_state == ompt_state_undefined) {
+            /* this thread will spin; set wait_id before entering wait state */
+            prev_state = this_thr->th.ompt_thread_info.state;
+            this_thr->th.ompt_thread_info.wait_id = (uint64_t) lck;
+            this_thr->th.ompt_thread_info.state = ompt_state_wait_lock;
+        }
+#endif
+
         if ( enqueued ) {
             if ( tail > 0 ) {
                 kmp_info_t *tail_thr = __kmp_thread_from_gtid( tail - 1 );
@@ -1346,6 +1370,13 @@ __kmp_acquire_queuing_lock_timed_template( kmp_queuing_lock_t *lck,
 #ifdef DEBUG_QUEUING_LOCKS
             TRACE_LOCK( gtid+1, "acq exit 2" );
 #endif
+
+#if OMPT_SUPPORT
+            /* change the state before clearing wait_id */
+            this_thr->th.ompt_thread_info.state = prev_state;
+            this_thr->th.ompt_thread_info.wait_id = 0;
+#endif
+
             /* got lock, we were dequeued by the thread that released lock */
             return;
         }
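
The prev_state bookkeeping in the acquire path follows one invariant: publish the wait before spinning, and on acquisition restore the state before clearing the wait id, so a sampling tool can never observe ompt_state_wait_lock with a null wait id. Condensed into hypothetical helpers (the real code is inlined above):

    static inline void
    __ompt_lock_wait_begin( kmp_info_t *this_thr, void *lck, ompt_state_t *prev )
    {
        *prev = this_thr->th.ompt_thread_info.state;
        this_thr->th.ompt_thread_info.wait_id = (uint64_t) lck;
        this_thr->th.ompt_thread_info.state   = ompt_state_wait_lock;
    }

    static inline void
    __ompt_lock_wait_end( kmp_info_t *this_thr, ompt_state_t prev )
    {
        this_thr->th.ompt_thread_info.state   = prev;   /* state first...  */
        this_thr->th.ompt_thread_info.wait_id = 0;      /* ...then the id  */
    }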
@@ -1491,6 +1522,11 @@ __kmp_release_queuing_lock( kmp_queuing_lock_t *lck, kmp_int32 gtid )
 #ifdef DEBUG_QUEUING_LOCKS
                 TRACE_LOCK_HT( gtid+1, "rel exit: ", 0, 0 );
 #endif
+
+#if OMPT_SUPPORT
+                /* nothing to do - no other thread is trying to shift blame */
+#endif
+
                 return KMP_LOCK_RELEASED;
             }
             dequeued = FALSE;
diff --git a/openmp/runtime/src/kmp_runtime.c b/openmp/runtime/src/kmp_runtime.c
index 1144ee5..e530011 100644
 #include "kmp_stats.h"
 #include "kmp_wait_release.h"
 
+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
+
 /* these are temporary issues to be dealt with */
 #define KMP_USE_PRCTL 0
 #define KMP_USE_POOLED_ALLOC 0
@@ -759,6 +763,16 @@ __kmp_parallel_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
         /* TODO replace with general release procedure */
         team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc );
 
+#if OMPT_SUPPORT && OMPT_BLAME
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
+            /* accept blame for "ordered" waiting */
+            kmp_info_t *this_thread = __kmp_threads[gtid];
+            ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
+                this_thread->th.ompt_thread_info.wait_id);
+        }
+#endif
+
         KMP_MB();       /* Flush all pending memory write invalidates.  */
     }
 #endif /* BUILD_PARALLEL_ORDERED */
@@ -1271,7 +1285,14 @@ __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
 
             __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
 
+#if OMPT_SUPPORT
+            ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
+#endif
+
             new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
+#if OMPT_SUPPORT
+                                           ompt_parallel_id,
+#endif
 #if OMP_40_ENABLED
                                            proc_bind,
 #endif
@@ -1355,6 +1376,11 @@ __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
         }
         this_thr->th.th_dispatch = serial_team->t.t_dispatch;
 
+#if OMPT_SUPPORT
+        ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
+        __ompt_team_assign_id(serial_team, ompt_parallel_id);
+#endif
+
         KMP_MB();
 
     } else {
@@ -1422,6 +1448,9 @@ __kmp_fork_call(
     int         gtid,
     enum fork_context_e  call_context, // Intel, GNU, ...
     kmp_int32   argc,
+#if OMPT_SUPPORT
+    void       *unwrapped_task,
+#endif
     microtask_t microtask,
     launch_t    invoker,
 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
@@ -1477,6 +1506,21 @@ __kmp_fork_call(
     root          = master_th->th.th_root;
     master_active = root->r.r_active;
     master_set_numthreads = master_th->th.th_set_nproc;
+
+#if OMPT_SUPPORT
+    ompt_parallel_id_t ompt_parallel_id;
+    ompt_task_id_t ompt_task_id;
+    ompt_frame_t *ompt_frame;
+    ompt_task_id_t my_task_id;
+    ompt_parallel_id_t my_parallel_id;
+
+    if (ompt_status & ompt_status_track) {
+        ompt_parallel_id = __ompt_parallel_id_new(gtid);
+        ompt_task_id = __ompt_get_task_id_internal(0);
+        ompt_frame = __ompt_get_task_frame_internal(0);
+    }
+#endif
+
     // Nested level will be an index in the nested nthreads array
     level         = parent_team->t.t_level;
 #if OMP_40_ENABLED
@@ -1493,6 +1537,16 @@ __kmp_fork_call(
     }
 #endif
 
+#if OMPT_SUPPORT
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_parallel_begin)) {
+        int team_size = master_set_numthreads;
+
+        ompt_callbacks.ompt_callback(ompt_event_parallel_begin)(
+            ompt_task_id, ompt_frame, ompt_parallel_id,
+            team_size, unwrapped_task);
+    }
+#endif
 
     master_th->th.th_ident = loc;
 
@@ -1519,11 +1573,77 @@ __kmp_fork_call(
             KMP_DEBUG_ASSERT( parent_team->t.t_serialized > 1 );
             parent_team->t.t_serialized--; // AC: need this so that enquiry functions
                                            //     work correctly; will restore at join time
+
+#if OMPT_SUPPORT
+            void *dummy;
+            void **exit_runtime_p;
+
+            ompt_lw_taskteam_t lw_taskteam;
+
+            if (ompt_status & ompt_status_track) {
+                __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
+                    unwrapped_task, ompt_parallel_id);
+                lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
+                exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
+
+                __ompt_lw_taskteam_link(&lw_taskteam, master_th);
+
+#if OMPT_TRACE
+                /* OMPT implicit task begin */
+                my_task_id = lw_taskteam.ompt_task_info.task_id;
+                my_parallel_id = parent_team->t.ompt_team_info.parallel_id;
+                if ((ompt_status == ompt_status_track_callback) &&
+                    ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
+                    ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
+                        my_parallel_id, my_task_id);
+                }
+#endif
+
+                /* OMPT state */
+                master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
+            } else {
+                exit_runtime_p = &dummy;
+            }
+#endif
+
             KMP_TIME_BLOCK(OMP_work);
-            __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv );
+            __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
+#if OMPT_SUPPORT
+                , exit_runtime_p
+#endif
+                );
+
+#if OMPT_SUPPORT
+            if (ompt_status & ompt_status_track) {
+#if OMPT_TRACE
+                lw_taskteam.ompt_task_info.frame.exit_runtime_frame = 0;
+
+                if ((ompt_status == ompt_status_track_callback) &&
+                    ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
+                    ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
+                        ompt_parallel_id, ompt_task_id);
+                }
+
+                __ompt_lw_taskteam_unlink(master_th);
+                // clear the task id only after unlinking the task
+                lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
+#endif
+
+                if ((ompt_status == ompt_status_track_callback) &&
+                    ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
+                    ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
+                        ompt_parallel_id, ompt_task_id);
+                }
+                master_th->th.ompt_thread_info.state = ompt_state_overhead;
+            }
+#endif
             return TRUE;
         }
+
         parent_team->t.t_pkfn  = microtask;
+#if OMPT_SUPPORT
+        parent_team->t.ompt_team_info.microtask = unwrapped_task;
+#endif
         parent_team->t.t_invoke = invoker;
         KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
         parent_team->t.t_active_level ++;
@@ -1620,10 +1740,70 @@ __kmp_fork_call(
                 // revert change made in __kmpc_serialized_parallel()
                 master_th->th.th_serial_team->t.t_level--;
                 // Get args from parent team for teams construct
+
+#if OMPT_SUPPORT
+                void *dummy;
+                void **exit_runtime_p;
+
+                ompt_lw_taskteam_t lw_taskteam;
+
+                if (ompt_status & ompt_status_track) {
+                    __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
+                        unwrapped_task, ompt_parallel_id);
+                    lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
+                    exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
+
+                    __ompt_lw_taskteam_link(&lw_taskteam, master_th);
+
+#if OMPT_TRACE
+                    my_task_id = lw_taskteam.ompt_task_info.task_id;
+                    if ((ompt_status == ompt_status_track_callback) &&
+                        ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
+                        ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
+                            ompt_parallel_id, my_task_id);
+                    }
+#endif
+
+                    /* OMPT state */
+                    master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
+                } else {
+                    exit_runtime_p = &dummy;
+                }
+#endif
+
                 {
                     KMP_TIME_BLOCK(OMP_work);
-                    __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv );
+                    __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
+#if OMPT_SUPPORT
+                        , exit_runtime_p
+#endif
+                    );
                 }
+
+#if OMPT_SUPPORT
+                if (ompt_status & ompt_status_track) {
+                    lw_taskteam.ompt_task_info.frame.exit_runtime_frame = 0;
+
+#if OMPT_TRACE
+                    if ((ompt_status == ompt_status_track_callback) &&
+                        ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
+                        ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
+                            ompt_parallel_id, ompt_task_id);
+                    }
+#endif
+
+                    __ompt_lw_taskteam_unlink(master_th);
+                    // clear the task id only after unlinking the task
+                    lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
+
+                    if ((ompt_status == ompt_status_track_callback) &&
+                        ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
+                        ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
+                            ompt_parallel_id, ompt_task_id);
+                    }
+                    master_th->th.ompt_thread_info.state = ompt_state_overhead;
+                }
+#endif
             } else if ( microtask == (microtask_t)__kmp_teams_master ) {
                 KMP_DEBUG_ASSERT( master_th->th.th_team == master_th->th.th_serial_team );
                 team = master_th->th.th_team;
@@ -1664,15 +1844,88 @@ __kmp_fork_call(
                     *argv++ = va_arg( ap, void * );
 #endif
                 KMP_MB();
+
+#if OMPT_SUPPORT
+                void *dummy;
+                void **exit_runtime_p;
+
+                ompt_lw_taskteam_t lw_taskteam;
+
+                if (ompt_status & ompt_status_track) {
+                    __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
+                        unwrapped_task, ompt_parallel_id);
+                    lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
+                    exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
+
+                    __ompt_lw_taskteam_link(&lw_taskteam, master_th);
+
+#if OMPT_TRACE
+                    /* OMPT implicit task begin */
+                    my_task_id = lw_taskteam.ompt_task_info.task_id;
+                    my_parallel_id = ompt_parallel_id;
+                    if ((ompt_status == ompt_status_track_callback) &&
+                        ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
+                        ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
+                            my_parallel_id, my_task_id);
+                    }
+#endif
+
+                    /* OMPT state */
+                    master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
+                } else {
+                    exit_runtime_p = &dummy;
+                }
+#endif
+
                 {
                     KMP_TIME_BLOCK(OMP_work);
-                    __kmp_invoke_microtask( microtask, gtid, 0, argc, args );
+                    __kmp_invoke_microtask( microtask, gtid, 0, argc, args
+#if OMPT_SUPPORT
+                        , exit_runtime_p
+#endif
+                    );
                 }
+
+#if OMPT_SUPPORT
+                if (ompt_status & ompt_status_track) {
+#if OMPT_TRACE
+                    lw_taskteam.ompt_task_info.frame.exit_runtime_frame = 0;
+
+                    if ((ompt_status == ompt_status_track_callback) &&
+                        ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
+                        ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
+                            my_parallel_id, my_task_id);
+                    }
+#endif
+
+                    __ompt_lw_taskteam_unlink(master_th);
+                    // clear the task id only after unlinking the task
+                    lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
+
+                    if ((ompt_status == ompt_status_track_callback) &&
+                        ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
+                        ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
+                            ompt_parallel_id, ompt_task_id);
+                    }
+                    master_th->th.ompt_thread_info.state = ompt_state_overhead;
+                }
+#endif
 #if OMP_40_ENABLED
             }
 #endif /* OMP_40_ENABLED */
         }
         else if ( call_context == fork_context_gnu ) {
+#if OMPT_SUPPORT
+            ompt_lw_taskteam_t *lwt = (ompt_lw_taskteam_t *)
+                __kmp_allocate(sizeof(ompt_lw_taskteam_t));
+            __ompt_lw_taskteam_init(lwt, master_th, gtid,
+                unwrapped_task, ompt_parallel_id);
+
+            lwt->ompt_task_info.task_id = __ompt_task_id_new(gtid);
+            lwt->ompt_task_info.frame.exit_runtime_frame = 0;
+            __ompt_lw_taskteam_link(lwt, master_th);
+#endif
+
             // we were called from GNU native code
             KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
             return FALSE;
@@ -1759,6 +2012,9 @@ __kmp_fork_call(
         /* allocate a new parallel team */
         KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
         team = __kmp_allocate_team(root, nthreads, nthreads,
+#if OMPT_SUPPORT
+                                   ompt_parallel_id,
+#endif
 #if OMP_40_ENABLED
                                    proc_bind,
 #endif
@@ -1767,6 +2023,9 @@ __kmp_fork_call(
         /* allocate a new parallel team */
         KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
         team = __kmp_allocate_team(root, nthreads, nthreads,
+#if OMPT_SUPPORT
+                                   ompt_parallel_id,
+#endif
 #if OMP_40_ENABLED
                                    proc_bind,
 #endif
@@ -1781,6 +2040,9 @@ __kmp_fork_call(
     team->t.t_ident      = loc;
     team->t.t_parent     = parent_team;
     TCW_SYNC_PTR(team->t.t_pkfn, microtask);
+#if OMPT_SUPPORT
+    TCW_SYNC_PTR(team->t.ompt_team_info.microtask, unwrapped_task);
+#endif
     team->t.t_invoke     = invoker;  /* TODO move this to root, maybe */
     // TODO: parent_team->t.t_level == INT_MAX ???
 #if OMP_40_ENABLED
@@ -1867,6 +2129,9 @@ __kmp_fork_call(
     __kmp_fork_team_threads( root, team, master_th, gtid );
     __kmp_setup_icv_copy( team, nthreads, &master_th->th.th_current_task->td_icvs, loc );
 
+#if OMPT_SUPPORT
+    master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
+#endif
 
     __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
 
@@ -1948,9 +2213,42 @@ __kmp_fork_call(
 
     KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
 
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        master_th->th.ompt_thread_info.state = ompt_state_overhead;
+    }
+#endif
+
     return TRUE;
 }
 
+#if OMPT_SUPPORT
+static inline void
+__kmp_join_restore_state(
+    kmp_info_t *thread,
+    kmp_team_t *team)
+{
+    // restore state outside the region
+    thread->th.ompt_thread_info.state = ((team->t.t_serialized) ?
+        ompt_state_work_serial : ompt_state_work_parallel);
+}
+
+static inline void
+__kmp_join_ompt(
+    kmp_info_t *thread,
+    kmp_team_t *team,
+    ompt_parallel_id_t parallel_id)
+{
+    if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
+        ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
+        ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
+            parallel_id, task_info->task_id);
+    }
+
+    __kmp_join_restore_state(thread, team);
+}
+#endif
+
 void
 __kmp_join_call(ident_t *loc, int gtid
 #if OMP_40_ENABLED
@@ -1976,6 +2274,12 @@ __kmp_join_call(ident_t *loc, int gtid
 
     master_th->th.th_ident = loc;
 
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        master_th->th.ompt_thread_info.state = ompt_state_overhead;
+    }
+#endif
+
 #if KMP_DEBUG
     if ( __kmp_tasking_mode != tskm_immediate_exec ) {
         KA_TRACE( 20, ( "__kmp_join_call: T#%d, old team = %p old task_team = %p, th_task_team = %p\n",
@@ -2003,6 +2307,13 @@ __kmp_join_call(ident_t *loc, int gtid
         }
 #endif /* OMP_40_ENABLED */
         __kmpc_end_serialized_parallel( loc, gtid );
+
+#if OMPT_SUPPORT
+        if (ompt_status == ompt_status_track_callback) {
+            __kmp_join_restore_state(master_th, parent_team);
+        }
+#endif
+
         return;
     }
 
@@ -2022,6 +2333,10 @@ __kmp_join_call(ident_t *loc, int gtid
 
     KMP_MB();
 
+#if OMPT_SUPPORT
+    ompt_parallel_id_t parallel_id = team->t.ompt_team_info.parallel_id;
+#endif
+
 #if USE_ITT_BUILD
     if ( __itt_stack_caller_create_ptr ) {
         __kmp_itt_stack_caller_destroy( (__itt_caller)team->t.t_stack_id ); // destroy the stack stitching id after join barrier
@@ -2097,6 +2412,13 @@ __kmp_join_call(ident_t *loc, int gtid
                 }
             }
         }
+
+#if OMPT_SUPPORT
+        if (ompt_status == ompt_status_track_callback) {
+            __kmp_join_ompt(master_th, parent_team, parallel_id);
+        }
+#endif
+
         return;
     }
 #endif /* OMP_40_ENABLED */
@@ -2182,6 +2504,12 @@ __kmp_join_call(ident_t *loc, int gtid
 
     __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
 
+#if OMPT_SUPPORT
+    if (ompt_status == ompt_status_track_callback) {
+        __kmp_join_ompt(master_th, parent_team, parallel_id);
+    }
+#endif
+
     KMP_MB();
     KA_TRACE( 20, ("__kmp_join_call: exit T#%d\n", gtid ));
 }
@@ -2814,11 +3142,15 @@ __kmp_initialize_root( kmp_root_t *root )
     /* setup the root team for this task */
     /* allocate the root team structure */
     KF_TRACE( 10, ( "__kmp_initialize_root: before root_team\n" ) );
+
     root_team =
         __kmp_allocate_team(
             root,
             1,                                                         // new_nproc
             1,                                                         // max_nproc
+#if OMPT_SUPPORT
+            0, // root parallel id
+#endif
 #if OMP_40_ENABLED
             __kmp_nested_proc_bind.bind_types[0],
 #endif
@@ -2845,11 +3177,15 @@ __kmp_initialize_root( kmp_root_t *root )
     /* setup the  hot team for this task */
     /* allocate the hot team structure */
     KF_TRACE( 10, ( "__kmp_initialize_root: before hot_team\n" ) );
+
     hot_team =
         __kmp_allocate_team(
             root,
             1,                                                         // new_nproc
             __kmp_dflt_team_nth_ub * 2,                                // max_nproc
+#if OMPT_SUPPORT
+            0, // root parallel id
+#endif
 #if OMP_40_ENABLED
             __kmp_nested_proc_bind.bind_types[0],
 #endif
@@ -3425,7 +3761,11 @@ __kmp_register_root( int initial_thread )
     if( ! root_thread->th.th_serial_team ) {
         kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
         KF_TRACE( 10, ( "__kmp_register_root: before serial_team\n" ) );
+
         root_thread->th.th_serial_team = __kmp_allocate_team( root, 1, 1,
+#if OMPT_SUPPORT
+          0, // root parallel id
+#endif
 #if OMP_40_ENABLED
           proc_bind_default,
 #endif
@@ -3563,6 +3903,14 @@ __kmp_reset_root(int gtid, kmp_root_t *root)
         __kmp_free_handle( root->r.r_uber_thread->th.th_info.ds.ds_thread );
     #endif /* KMP_OS_WINDOWS */
 
+#if OMPT_SUPPORT
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
+        int gtid = __kmp_get_gtid();
+        __ompt_thread_end(ompt_thread_initial, gtid);
+    }
+#endif
+
     TCW_4(__kmp_nth, __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
     __kmp_reap_thread( root->r.r_uber_thread, 1 );
 
@@ -3894,8 +4242,12 @@ __kmp_allocate_thread( kmp_root_t *root, kmp_team_t *team, int new_tid )
     {
     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs( team );
     KF_TRACE( 10, ( "__kmp_allocate_thread: before th_serial/serial_team\n" ) );
+
     new_thr->th.th_serial_team = serial_team =
         (kmp_team_t*) __kmp_allocate_team( root, 1, 1,
+#if OMPT_SUPPORT
+                                           0, // root parallel id
+#endif
 #if OMP_40_ENABLED
                                            proc_bind_default,
 #endif
@@ -4395,6 +4747,9 @@ __kmp_partition_places( kmp_team_t *team )
 /* allocate a new team data structure to use.  take one off of the free pool if available */
 kmp_team_t *
 __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
+#if OMPT_SUPPORT
+    ompt_parallel_id_t ompt_parallel_id,
+#endif
 #if OMP_40_ENABLED
     kmp_proc_bind_t new_proc_bind,
 #endif
@@ -4764,6 +5119,10 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
         }
 #endif
 
+#if OMPT_SUPPORT
+        __ompt_team_assign_id(team, ompt_parallel_id);
+#endif
+
         KMP_MB();
 
         return team;
@@ -4804,6 +5163,11 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
 #endif
 
             KA_TRACE( 20, ("__kmp_allocate_team: using team from pool %d.\n", team->t.t_id ));
+
+#if OMPT_SUPPORT
+            __ompt_team_assign_id(team, ompt_parallel_id);
+#endif
+
             KMP_MB();
 
             return team;
@@ -4856,6 +5220,11 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
     team->t.t_proc_bind = new_proc_bind;
 #endif
 
+#if OMPT_SUPPORT
+    __ompt_team_assign_id(team, ompt_parallel_id);
+    team->t.ompt_serialized_team_info = NULL;
+#endif
+
     KMP_MB();
 
     KA_TRACE( 20, ("__kmp_allocate_team: done creating a new team %d.\n", team->t.t_id ));
@@ -5101,6 +5470,18 @@ __kmp_launch_thread( kmp_info_t *this_thr )
         this_thr->th.th_cons = __kmp_allocate_cons_stack( gtid );  // ATT: Memory leak?
     }
 
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        this_thr->th.ompt_thread_info.state = ompt_state_overhead;
+        this_thr->th.ompt_thread_info.wait_id = 0;
+        this_thr->th.ompt_thread_info.idle_frame = __builtin_frame_address(0);
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_thread_begin)) {
+            __ompt_thread_begin(ompt_thread_worker, gtid);
+        }
+    }
+#endif
+
     /* This is the place where threads wait for work */
     while( ! TCR_4(__kmp_global.g.g_done) ) {
         KMP_DEBUG_ASSERT( this_thr == __kmp_threads[ gtid ] );
@@ -5109,9 +5490,21 @@ __kmp_launch_thread( kmp_info_t *this_thr )
         /* wait for work to do */
         KA_TRACE( 20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid ));
 
+#if OMPT_SUPPORT
+        if (ompt_status & ompt_status_track) {
+            this_thr->th.ompt_thread_info.state = ompt_state_idle;
+        }
+#endif
+
         /* No tid yet since not part of a team */
         __kmp_fork_barrier( gtid, KMP_GTID_DNE );
 
+#if OMPT_SUPPORT
+        if (ompt_status & ompt_status_track) {
+            this_thr->th.ompt_thread_info.state = ompt_state_overhead;
+        }
+#endif
+
         pteam = (kmp_team_t *(*))(& this_thr->th.th_team);
 
         /* have we been allocated? */
@@ -5124,6 +5517,12 @@ __kmp_launch_thread( kmp_info_t *this_thr )
 
                 updateHWFPControl (*pteam);
 
+#if OMPT_SUPPORT
+                if (ompt_status & ompt_status_track) {
+                    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+                }
+#endif
+
                 KMP_STOP_EXPLICIT_TIMER(USER_launch_thread_loop);
                 {
                     KMP_TIME_BLOCK(USER_worker_invoke);
@@ -5132,6 +5531,15 @@ __kmp_launch_thread( kmp_info_t *this_thr )
                 KMP_START_EXPLICIT_TIMER(USER_launch_thread_loop);
                 KMP_ASSERT( rc );
 
+#if OMPT_SUPPORT
+                if (ompt_status & ompt_status_track) {
+                    /* no frame set while outside task */
+                    int tid = __kmp_tid_from_gtid(gtid);
+                    (*pteam)->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_runtime_frame = 0;
+
+                    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
+                }
+#endif
                 KMP_MB();
                 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
                               gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn));
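Taken together, the additions to __kmp_launch_thread give each worker a simple, observable lifecycle: overhead at startup, idle while parked in the fork barrier, overhead once released, work_parallel around the microtask, then overhead again with exit_runtime_frame cleared once the implicit task's frame is dead. In outline:

    /* per-iteration state sequence set by the code above */
    /*  ompt_state_overhead       thread bookkeeping, before the barrier  */
    /*  ompt_state_idle           parked in __kmp_fork_barrier            */
    /*  ompt_state_overhead       woken, joining a team                   */
    /*  ompt_state_work_parallel  inside (*pteam)->t.t_invoke(gtid)       */
    /*  ompt_state_overhead       microtask done, exit frame zeroed       */

A sampling tool reading ompt_thread_info.state therefore never pairs a stale exit_runtime_frame with a work state.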
@@ -5142,6 +5550,13 @@ __kmp_launch_thread( kmp_info_t *this_thr )
     }
     TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
 
+#if OMPT_SUPPORT
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
+        __ompt_thread_end(ompt_thread_worker, gtid);
+    }
+#endif
+
     if ( TCR_PTR( this_thr->th.th_task_team ) != NULL ) {
         __kmp_unref_task_team( this_thr->th.th_task_team, this_thr );
     }
@@ -5480,6 +5895,9 @@ __kmp_internal_end(void)
 
 
     __kmp_cleanup();
+#if OMPT_SUPPORT
+    ompt_fini();
+#endif
 }
 
 void
@@ -6140,6 +6558,9 @@ __kmp_do_serial_initialize( void )
     KMP_MB();
 
     KA_TRACE( 10, ("__kmp_do_serial_initialize: exit\n" ) );
+#if OMPT_SUPPORT
+    ompt_init();
+#endif
 }
 
 void
@@ -6284,6 +6705,9 @@ __kmp_middle_initialize( void )
     }
     __kmp_do_middle_initialize();
     __kmp_release_bootstrap_lock( &__kmp_initz_lock );
+#if OMPT_SUPPORT
+    ompt_init();
+#endif
 }
 
 void
@@ -6353,6 +6777,9 @@ __kmp_parallel_initialize( void )
     KA_TRACE( 10, ("__kmp_parallel_initialize: exit\n" ) );
 
     __kmp_release_bootstrap_lock( &__kmp_initz_lock );
+#if OMPT_SUPPORT
+    ompt_init();
+#endif
 }
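ompt_init() is now called at the tail of serial, middle, and parallel initialization, so whichever level an application reaches first brings the tool interface up, and later levels find it already initialized. That only works if the function tolerates repeated calls; presumably it is guarded roughly like this (hypothetical sketch; the real logic is in the OMPT support sources, not in this diff):

    /* hypothetical idempotence guard for ompt_init(); illustration only */
    void ompt_init(void)
    {
        static int ompt_init_done = 0;
        if (ompt_init_done)
            return;
        ompt_init_done = 1;
        /* locate the tool, fill in ompt_callbacks, set ompt_status ... */
    }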
 
 
@@ -6409,8 +6836,49 @@ __kmp_invoke_task_func( int gtid )
 #if INCLUDE_SSC_MARKS
     SSC_MARK_INVOKING();
 #endif
+
+#if OMPT_SUPPORT
+    void *dummy;
+    void **exit_runtime_p;
+#if OMPT_TRACE
+    ompt_task_id_t my_task_id;
+    ompt_parallel_id_t my_parallel_id;
+#endif
+
+    if (ompt_status & ompt_status_track) {
+        exit_runtime_p = &(team->t.t_implicit_task_taskdata[tid].
+            ompt_task_info.frame.exit_runtime_frame);
+    } else {
+        exit_runtime_p = &dummy;
+    }
+
+#if OMPT_TRACE
+    my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
+    my_parallel_id = team->t.ompt_team_info.parallel_id;
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
+        ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
+            my_parallel_id, my_task_id);
+    }
+#endif
+#endif
+
     rc = __kmp_invoke_microtask( (microtask_t) TCR_SYNC_PTR(team->t.t_pkfn),
-      gtid, tid, (int) team->t.t_argc, (void **) team->t.t_argv );
+      gtid, tid, (int) team->t.t_argc, (void **) team->t.t_argv
+#if OMPT_SUPPORT
+      , exit_runtime_p
+#endif
+      );
+
+#if OMPT_SUPPORT && OMPT_TRACE
+    if (ompt_status & ompt_status_track) {
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
+            ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
+                my_parallel_id, my_task_id);
+        }
+        // the implicit task is not dead yet, so we can't clear its task id here
+        team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_runtime_frame = 0;
+    }
+#endif
 
 #if USE_ITT_BUILD
     if ( __itt_stack_caller_create_ptr ) {
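The exit_runtime_p indirection deserves a comment: __kmp_invoke_microtask (see the assembly changes at the end of this patch) stores the caller's frame pointer through its extra argument unconditionally, which keeps the hand-written prologues branch-free. When tracking is off, the runtime simply aims that store at a throwaway local. Reduced to its essentials (tracking_enabled and task_frame stand in for the real expressions above):

    /* the dummy-slot pattern used above, in isolation */
    void *dummy;
    void **exit_runtime_p = tracking_enabled
        ? &task_frame->exit_runtime_frame  /* slot a tool will read */
        : &dummy;                          /* store lands here and dies */
    /* the callee always executes: *exit_runtime_p = frame_pointer; */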
@@ -6442,7 +6910,10 @@ __kmp_teams_master( int gtid )
 #endif
     __kmp_fork_call( loc, gtid, fork_context_intel,
             team->t.t_argc,
-            (microtask_t)thr->th.th_teams_microtask,
+#if OMPT_SUPPORT
+            (void *)thr->th.th_teams_microtask,      // "unwrapped" task
+#endif
+            (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
             VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
             NULL );
 #if INCLUDE_SSC_MARKS
index 79eee33..375cacb 100644 (file)
 #include "kmp_stats.h"
 #include "kmp_itt.h"
 
+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
+
 // template for type limits
 template< typename T >
 struct i_maxmin {
@@ -89,6 +93,11 @@ __kmp_for_static_init(
     register UT          trip_count;
     register kmp_team_t *team;
 
+#if OMPT_SUPPORT && OMPT_TRACE
+    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
+    ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
+#endif
+
     KMP_DEBUG_ASSERT( plastiter && plower && pupper && pstride );
     KE_TRACE( 10, ("__kmpc_for_static_init called (%d)\n", global_tid));
     #ifdef KMP_DEBUG
@@ -132,6 +141,15 @@ __kmp_for_static_init(
         }
         #endif
         KE_TRACE( 10, ("__kmpc_for_static_init: T#%d return\n", global_tid ) );
+
+#if OMPT_SUPPORT && OMPT_TRACE
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
+            ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
+                team_info->parallel_id, task_info->task_id,
+                team_info->microtask);
+        }
+#endif
         return;
     }
 
@@ -168,6 +186,15 @@ __kmp_for_static_init(
         }
         #endif
         KE_TRACE( 10, ("__kmpc_for_static_init: T#%d return\n", global_tid ) );
+
+#if OMPT_SUPPORT && OMPT_TRACE
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
+            ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
+                team_info->parallel_id, task_info->task_id,
+                team_info->microtask);
+        }
+#endif
         return;
     }
     nth = team->t.t_nproc;
@@ -187,6 +214,15 @@ __kmp_for_static_init(
         }
         #endif
         KE_TRACE( 10, ("__kmpc_for_static_init: T#%d return\n", global_tid ) );
+
+#if OMPT_SUPPORT && OMPT_TRACE
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
+            ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
+                team_info->parallel_id, task_info->task_id,
+                team_info->microtask);
+        }
+#endif
         return;
     }
 
@@ -304,6 +340,15 @@ __kmp_for_static_init(
     }
     #endif
     KE_TRACE( 10, ("__kmpc_for_static_init: T#%d return\n", global_tid ) );
+
+#if OMPT_SUPPORT && OMPT_TRACE
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
+        ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
+            team_info->parallel_id, task_info->task_id, team_info->microtask);
+    }
+#endif
+
     return;
 }
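The same ompt_event_loop_begin block now appears at all four exits of __kmp_for_static_init. Since team_info and task_info are hoisted to the top of the function, the duplication could be folded into a helper; a possible shape (hypothetical refactoring, not part of this patch):

    #if OMPT_SUPPORT && OMPT_TRACE
    /* hypothetical helper collapsing the four duplicated blocks */
    static inline void
    __ompt_loop_begin_notify(ompt_team_info_t *team_info,
                             ompt_task_info_t *task_info)
    {
        if ((ompt_status == ompt_status_track_callback) &&
            ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
            ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
                team_info->parallel_id, task_info->task_id,
                team_info->microtask);
        }
    }
    #endif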
 
index 818a371..f5c6eea 100644 (file)
@@ -434,6 +434,18 @@ __kmp_task_start( kmp_int32 gtid, kmp_task_t * task, kmp_taskdata_t * current_ta
     KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n",
                   gtid, taskdata ) );
 
+#if OMPT_SUPPORT
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_task_begin)) {
+        kmp_taskdata_t *parent = taskdata->td_parent;
+        ompt_callbacks.ompt_callback(ompt_event_task_begin)(
+            parent ? parent->ompt_task_info.task_id : ompt_task_id_none,
+            parent ? &(parent->ompt_task_info.frame) : NULL,
+            taskdata->ompt_task_info.task_id,
+            taskdata->ompt_task_info.function);
+    }
+#endif
+
     return;
 }
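The task_begin callback hands a tool both ends of the creation edge: the parent's id and frame pair (NULL-guarded, since the guard suggests the root implicit task has no parent), plus the new task's id and the outlined function it will run. A consumer might look like this (sketch; record_task_edge is a hypothetical tool-side routine):

    /* sketch of a tool-side ompt_event_task_begin consumer */
    extern void record_task_edge(ompt_task_id_t parent, ompt_task_id_t child,
                                 void *entry_point);

    static void
    on_task_begin(ompt_task_id_t parent_task_id,
                  ompt_frame_t  *parent_task_frame,
                  ompt_task_id_t new_task_id,
                  void          *new_task_function)
    {
        /* remember parent -> child plus the entry point, so later
           samples inside the function can be attributed to this task */
        record_task_edge(parent_task_id, new_task_id, new_task_function);
    }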
 
@@ -579,6 +591,15 @@ __kmp_task_finish( kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t *resumed_tas
     kmp_info_t * thread = __kmp_threads[ gtid ];
     kmp_int32 children = 0;
 
+#if OMPT_SUPPORT
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_task_end)) {
+        ompt_callbacks.ompt_callback(ompt_event_task_end)(
+            taskdata->ompt_task_info.task_id);
+    }
+#endif
+
     KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming task %p\n",
                   gtid, taskdata, resumed_task) );
 
@@ -654,6 +675,9 @@ __kmp_task_finish( kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t *resumed_tas
     // Free this task and then ancestor tasks if they have no children.
     __kmp_free_task_and_ancestors(gtid, taskdata, thread);
 
+    // FIXME johnmc: I think this statement should come before the preceding one
+    // so that if an asynchronous inquiry peers into the runtime system it
+    // doesn't see the freed task as the current task.
     __kmp_threads[ gtid ] -> th.th_current_task = resumed_task; // restore current_task
 
     // TODO: GEH - make sure root team implicit task is initialized properly.
@@ -783,6 +807,10 @@ __kmp_init_implicit_task( ident_t *loc_ref, kmp_info_t *this_thr, kmp_team_t *te
         KMP_DEBUG_ASSERT(task->td_allocated_child_tasks  == 0);
     }
 
+#if OMPT_SUPPORT
+    __kmp_task_init_ompt(task, tid);
+#endif
+
     KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n",
                   tid, team, task ) );
 }
@@ -937,6 +965,15 @@ __kmp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_tasking_flags_t *flags,
     KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
                   gtid, taskdata, taskdata->td_parent) );
 
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        taskdata->ompt_task_info.task_id = __ompt_task_id_new(gtid);
+        taskdata->ompt_task_info.function = (void*) task_entry;
+        taskdata->ompt_task_info.frame = (ompt_frame_t)
+            { .exit_runtime_frame = NULL, .reenter_runtime_frame = NULL };
+    }
+#endif
+
     return task;
 }
 
@@ -984,6 +1021,19 @@ __kmp_invoke_task( kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t * current_ta
 
     __kmp_task_start( gtid, task, current_task );
 
+#if OMPT_SUPPORT
+    ompt_thread_info_t oldInfo;
+    kmp_info_t * thread;
+    if (ompt_status & ompt_status_track) {
+        // Store the thread's state and restore it after the task runs
+        thread = __kmp_threads[ gtid ];
+        oldInfo = thread->th.ompt_thread_info;
+        thread->th.ompt_thread_info.wait_id = 0;
+        thread->th.ompt_thread_info.state = ompt_state_work_parallel;
+        taskdata->ompt_task_info.frame.exit_runtime_frame = __builtin_frame_address(0);
+    }
+#endif
+
 #if OMP_40_ENABLED
     // TODO: cancel tasks if the parallel region has also been cancelled
     // TODO: check if this sequence can be hoisted above __kmp_task_start
@@ -1017,6 +1067,14 @@ __kmp_invoke_task( kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t * current_ta
     }
 #endif // OMP_40_ENABLED
 
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        thread->th.ompt_thread_info = oldInfo;
+        taskdata->ompt_task_info.frame.exit_runtime_frame = 0;
+    }
+#endif
+
     __kmp_task_finish( gtid, task, current_task );
 
     KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
@@ -1073,6 +1131,13 @@ __kmp_omp_task( kmp_int32 gtid, kmp_task_t * new_task, bool serialize_immediate
 {
     kmp_taskdata_t * new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
 
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        new_taskdata->ompt_task_info.frame.reenter_runtime_frame =
+            __builtin_frame_address(0);
+    }
+#endif
+
     /* Should we execute the new task or queue it?   For now, let's just always try to
        queue it.  If the queue fills up, then we'll execute it.  */
 
@@ -1084,6 +1149,11 @@ __kmp_omp_task( kmp_int32 gtid, kmp_task_t * new_task, bool serialize_immediate
         __kmp_invoke_task( gtid, new_task, current_task );
     }
 
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        new_taskdata->ompt_task_info.frame.reenter_runtime_frame = 0;
+    }
+#endif
 
     return TASK_CURRENT_NOT_QUEUED;
 }
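The reenter_runtime_frame bracket mirrors the exit_runtime_frame handling in __kmp_invoke_task above: marked on the way into the runtime, cleared on the way back to user code. A tool that interrupts the thread in between (from a sampling signal, say) can unwind to the recorded frame and attribute everything above it to the runtime rather than to the application. The idiom in isolation (do_runtime_work is a stand-in):

    /* the frame-bracketing idiom used above */
    extern void do_runtime_work(void);

    static void runtime_entry(ompt_frame_t *frame)
    {
        frame->reenter_runtime_frame = __builtin_frame_address(0);
        do_runtime_work();                 /* samples here unwind to the mark */
        frame->reenter_runtime_frame = 0;  /* back to plain user code */
    }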
index d517af2..3073a07 100644 (file)
@@ -95,6 +95,32 @@ static inline void __kmp_wait_template(kmp_info_t *this_thr, C *flag, int final_
     th_gtid = this_thr->th.th_info.ds.ds_gtid;
     KA_TRACE(20, ("__kmp_wait_sleep: T#%d waiting for flag(%p)\n", th_gtid, flag));
 
+#if OMPT_SUPPORT && OMPT_BLAME
+    if (ompt_status == ompt_status_track_callback) {
+        if (this_thr->th.ompt_thread_info.state == ompt_state_idle) {
+            if (ompt_callbacks.ompt_callback(ompt_event_idle_begin)) {
+                ompt_callbacks.ompt_callback(ompt_event_idle_begin)(th_gtid + 1);
+            }
+        } else if (ompt_callbacks.ompt_callback(ompt_event_wait_barrier_begin)) {
+            KMP_DEBUG_ASSERT(this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier ||
+                             this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit ||
+                             this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_explicit);
+
+            ompt_lw_taskteam_t* team = this_thr->th.th_team->t.ompt_serialized_team_info;
+            ompt_parallel_id_t pId;
+            ompt_task_id_t tId;
+            if (team) {
+                pId = team->ompt_team_info.parallel_id;
+                tId = team->ompt_task_info.task_id;
+            } else {
+                pId = this_thr->th.th_team->t.ompt_team_info.parallel_id;
+                tId = this_thr->th.th_current_task->ompt_task_info.task_id;
+            }
+            ompt_callbacks.ompt_callback(ompt_event_wait_barrier_begin)(pId, tId);
+        }
+    }
+#endif
+
     // Setup for waiting
     KMP_INIT_YIELD(spins);
 
@@ -207,6 +233,33 @@ static inline void __kmp_wait_template(kmp_info_t *this_thr, C *flag, int final_
         }
         // TODO: If thread is done with work and times out, disband/free
     }
+
+#if OMPT_SUPPORT && OMPT_BLAME
+    if (ompt_status == ompt_status_track_callback) {
+        if (this_thr->th.ompt_thread_info.state == ompt_state_idle) {
+            if (ompt_callbacks.ompt_callback(ompt_event_idle_end)) {
+                ompt_callbacks.ompt_callback(ompt_event_idle_end)(th_gtid + 1);
+            }
+        } else if (ompt_callbacks.ompt_callback(ompt_event_wait_barrier_end)) {
+            KMP_DEBUG_ASSERT(this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier ||
+                             this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit ||
+                             this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_explicit);
+
+            ompt_lw_taskteam_t* team = this_thr->th.th_team->t.ompt_serialized_team_info;
+            ompt_parallel_id_t pId;
+            ompt_task_id_t tId;
+            if (team) {
+                pId = team->ompt_team_info.parallel_id;
+                tId = team->ompt_task_info.task_id;
+            } else {
+                pId = this_thr->th.th_team->t.ompt_team_info.parallel_id;
+                tId = this_thr->th.th_current_task->ompt_task_info.task_id;
+            }
+            ompt_callbacks.ompt_callback(ompt_event_wait_barrier_end)(pId, tId);
+        }
+    }
+#endif
+
     KMP_FSYNC_SPIN_ACQUIRED(spin);
 }
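The begin and end blocks differ only in which callback they fire; the lines resolving pId and tId, preferring the innermost serialized region's ids over the real team's, are duplicated verbatim. A candidate extraction (hypothetical, not in this patch):

    /* hypothetical helper factoring the duplicated id lookup */
    static inline void
    __ompt_get_barrier_wait_ids(kmp_info_t *this_thr,
                                ompt_parallel_id_t *pId,
                                ompt_task_id_t *tId)
    {
        ompt_lw_taskteam_t *team =
            this_thr->th.th_team->t.ompt_serialized_team_info;
        if (team) {
            *pId = team->ompt_team_info.parallel_id;
            *tId = team->ompt_task_info.task_id;
        } else {
            *pId = this_thr->th.th_team->t.ompt_team_info.parallel_id;
            *tId = this_thr->th.th_current_task->ompt_task_info.task_id;
        }
    }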
 
index c33c817..ebbd18f 100644 (file)
@@ -598,6 +598,12 @@ __kmp_unnamed_critical_addr:
                                //      temp: -8(%ebp)
                                //
        pushl %ebx              // save %ebx to use during this routine
+                               //
+#if OMPT_SUPPORT
+       movl 28(%ebp),%ebx      // get exit_frame address
+       movl %ebp,(%ebx)        // save exit_frame
+#endif
+
        movl 20(%ebp),%ebx      // Stack alignment - # args
        addl $2,%ebx            // #args +2  Always pass at least 2 args (gtid and tid)
        shll $2,%ebx            // Number of bytes used on stack: (#args+2)*4
@@ -1221,6 +1227,7 @@ KMP_LABEL(invoke_3):
 //     %edx:   tid
 //     %ecx:   argc
 //     %r8:    p_argv
+//     %r9:    &exit_frame
 //
 // locals:
 //     __gtid: gtid parm pushed on stack so can pass &gtid to pkfn
@@ -1250,6 +1257,11 @@ __tid = -24
        KMP_CFI_OFFSET rbp,-16
        movq    %rsp,%rbp       // establish the base pointer for this routine.
        KMP_CFI_REGISTER rbp
+
+#if OMPT_SUPPORT
+       movq    %rbp, (%r9)     // save exit_frame
+#endif
+
        pushq   %rbx            // %rbx is callee-saved register
        pushq   %rsi            // Put gtid on stack so can pass &tgid to pkfn
        pushq   %rdx            // Put tid on stack so can pass &tid to pkfn
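On both ISAs the new store happens as soon as the frame pointer is established: &exit_frame arrives in %r9 (the sixth integer argument under the SysV AMD64 ABI, matching the register map in the comment above) or at 28(%ebp) (the sixth stack argument under IA-32 cdecl, after pkfn, gtid, tid, argc, and p_argv). The C equivalent of the added instructions is simply:

    /* C equivalent of the prologue stores added above (sketch) */
    *exit_frame_ptr = __builtin_frame_address(0);

so the implicit task's frame record points at the frame from which pkfn is about to be called, letting a tool's unwinder split user frames from runtime frames at a well-defined boundary.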
index a34434f..a4f9a38 100644 (file)
@@ -582,6 +582,9 @@ _gtid$ = 12
 _tid$ = 16
 _argc$ = 20
 _argv$ = 24
+if OMPT_SUPPORT
+_exit_frame$ = 28
+endif
 _i$ = -8
 _stk_adj$ = -16
 _vptr$ = -12
@@ -595,6 +598,10 @@ ___kmp_invoke_microtask PROC NEAR
         push    ebx
         push    esi
         push    edi
+if OMPT_SUPPORT
+        mov     eax, DWORD PTR _exit_frame$[ebp]
+        mov     DWORD PTR [eax], ebp
+endif
 ; Line 114
         mov     eax, DWORD PTR _argc$[ebp]
         mov     DWORD PTR _i$[ebp], eax
@@ -1307,6 +1314,9 @@ $_gtid   = 24
 $_tid    = 32
 $_argc   = 40
 $_p_argv = 48
+if OMPT_SUPPORT
+$_exit_frame = 56
+endif
 
 PUBLIC  __kmp_invoke_microtask
 _TEXT   SEGMENT
@@ -1322,6 +1332,10 @@ __kmp_invoke_microtask PROC FRAME ;NEAR
         lea     rbp, QWORD PTR [rsp]           ; establish the base pointer
         .setframe rbp, 0
         .ENDPROLOG
+if OMPT_SUPPORT
+        mov     rax, QWORD PTR $_exit_frame[rbp]
+        mov     QWORD PTR [rax], rbp
+endif
        mov     r10, rcx        ; save pkfn pointer for later
 
 ;; ------------------------------------------------------------
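The Windows listings make the same change in MASM terms. On IA-32 the _exit_frame$ = 28 equate matches the Linux offsets (six cdecl arguments starting at [ebp+8]). On x64 the Microsoft convention passes only four integer arguments in registers, so the sixth, &exit_frame, lives on the caller's stack; with what the offsets imply is a single register push before the lea that establishes rbp, the layout behind the equates is:

    [rbp+ 0] saved register        [rbp+ 8] return address
    [rbp+16] home: rcx = pkfn      [rbp+24] home: rdx = gtid
    [rbp+32] home: r8  = tid       [rbp+40] home: r9  = argc
    [rbp+48] arg 5: p_argv         [rbp+56] arg 6: &exit_frame

which is where the mov rax, QWORD PTR $_exit_frame[rbp] above picks it up. Either way the effect matches the Linux side: immediately after the prologue, the caller-supplied slot receives the new frame pointer, anchoring OMPT unwinds.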