#include "kmp_itt.h"
#include "kmp_os.h"
#include "kmp_stats.h"
-#if OMPT_SUPPORT
#include "ompt-specific.h"
-#endif
#if KMP_MIC
#include <immintrin.h>
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
team->t.t_id, i));
ANNOTATE_REDUCE_AFTER(reduce);
+ OMPT_REDUCTION_DECL(this_thr, gtid);
+ OMPT_REDUCTION_BEGIN;
(*reduce)(this_thr->th.th_local.reduce_data,
other_threads[i]->th.th_local.reduce_data);
+ OMPT_REDUCTION_END;
ANNOTATE_REDUCE_BEFORE(reduce);
ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
}
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
team->t.t_id, child_tid));
ANNOTATE_REDUCE_AFTER(reduce);
+ OMPT_REDUCTION_DECL(this_thr, gtid);
+ OMPT_REDUCTION_BEGIN;
(*reduce)(this_thr->th.th_local.reduce_data,
child_thr->th.th_local.reduce_data);
+ OMPT_REDUCTION_END;
ANNOTATE_REDUCE_BEFORE(reduce);
ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
}
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
team->t.t_id, child_tid));
ANNOTATE_REDUCE_AFTER(reduce);
+ OMPT_REDUCTION_DECL(this_thr, gtid);
+ OMPT_REDUCTION_BEGIN;
(*reduce)(this_thr->th.th_local.reduce_data,
child_thr->th.th_local.reduce_data);
+ OMPT_REDUCTION_END;
ANNOTATE_REDUCE_BEFORE(reduce);
ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
}
flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
if (reduce) {
ANNOTATE_REDUCE_AFTER(reduce);
+ OMPT_REDUCTION_DECL(this_thr, gtid);
+ OMPT_REDUCTION_BEGIN;
for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
++child_tid) {
KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
(*reduce)(this_thr->th.th_local.reduce_data,
other_threads[child_tid]->th.th_local.reduce_data);
}
+ OMPT_REDUCTION_END;
ANNOTATE_REDUCE_BEFORE(reduce);
ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
}
#include "kmp_itt.h"
#include "kmp_lock.h"
#include "kmp_stats.h"
-
-#if OMPT_SUPPORT
#include "ompt-specific.h"
-#endif
#define MAX_MESSAGE 512
loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
__KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
+ OMPT_REDUCTION_DECL(th, global_tid);
if (packed_reduction_method == critical_reduce_block) {
+ OMPT_REDUCTION_BEGIN;
+
__kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
retval = 1;
} else if (packed_reduction_method == empty_reduce_block) {
+ OMPT_REDUCTION_BEGIN;
+
// usage: if team size == 1, no synchronization is required ( Intel
// platforms only )
retval = 1;
packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
+ OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
+
if (packed_reduction_method == critical_reduce_block) {
__kmp_end_critical_section_reduce_block(loc, global_tid, lck);
+ OMPT_REDUCTION_END;
} else if (packed_reduction_method == empty_reduce_block) {
// usage: if team size == 1, no synchronization is required ( on Intel
// platforms only )
+ OMPT_REDUCTION_END;
+
} else if (packed_reduction_method == atomic_reduce_block) {
// neither master nor other workers should get here
tree_reduce_block)) {
// only master gets here
+ // OMPT: tree reduction is annotated in the barrier code
} else {
loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
__KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
+ OMPT_REDUCTION_DECL(th, global_tid);
+
if (packed_reduction_method == critical_reduce_block) {
+ OMPT_REDUCTION_BEGIN;
__kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
retval = 1;
} else if (packed_reduction_method == empty_reduce_block) {
+ OMPT_REDUCTION_BEGIN;
// usage: if team size == 1, no synchronization is required ( Intel
// platforms only )
retval = 1;
// this barrier should be visible to a customer and to the threading profile
// tool (it's a terminating barrier on constructs if NOWAIT not specified)
+ OMPT_REDUCTION_DECL(th, global_tid);
if (packed_reduction_method == critical_reduce_block) {
__kmp_end_critical_section_reduce_block(loc, global_tid, lck);
+ OMPT_REDUCTION_END;
+
// TODO: implicit barrier: should be exposed
#if OMPT_SUPPORT
ompt_frame_t *ompt_frame;
} else if (packed_reduction_method == empty_reduce_block) {
+ OMPT_REDUCTION_END;
+
// usage: if team size==1, no synchronization is required (Intel platforms only)
// TODO: implicit barrier: should be exposed
#define ompt_callback_cancel_implemented ompt_event_MAY_ALWAYS_OPTIONAL
-#define ompt_callback_reduction_implemented ompt_event_UNIMPLEMENTED
+#define ompt_callback_reduction_implemented ompt_event_MAY_ALWAYS_OPTIONAL
#define ompt_callback_dispatch_implemented ompt_event_UNIMPLEMENTED
#include "kmp.h"
+#if OMPT_SUPPORT
/*****************************************************************************
* forward declarations
****************************************************************************/
inline const char *ompt_get_runtime_version() {
return &__kmp_version_lib_ver[KMP_VERSION_MAGIC_LEN];
}
+#endif // OMPT_SUPPORT
+
+// macros providing the OMPT callbacks for reduction clause
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+// Capture the current task data, parallel (team) data, and the stored
+// return address as locals. Must appear before OMPT_REDUCTION_BEGIN/END
+// in the same scope, since those macros reference the locals declared here.
+#define OMPT_REDUCTION_DECL(this_thr, gtid) \
+ ompt_data_t *my_task_data = OMPT_CUR_TASK_DATA(this_thr); \
+ ompt_data_t *my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr); \
+ void *return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
+// Dispatch the reduction callback with ompt_scope_begin, if a tool
+// registered one. Requires a preceding OMPT_REDUCTION_DECL in scope.
+#define OMPT_REDUCTION_BEGIN \
+ if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) { \
+ ompt_callbacks.ompt_callback(ompt_callback_reduction)( \
+ ompt_sync_region_reduction, ompt_scope_begin, my_parallel_data, \
+ my_task_data, return_address); \
+ }
+// Dispatch the reduction callback with ompt_scope_end, if a tool
+// registered one. Requires a preceding OMPT_REDUCTION_DECL in scope.
+#define OMPT_REDUCTION_END \
+ if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) { \
+ ompt_callbacks.ompt_callback(ompt_callback_reduction)( \
+ ompt_sync_region_reduction, ompt_scope_end, my_parallel_data, \
+ my_task_data, return_address); \
+ }
+#else // OMPT_SUPPORT && OMPT_OPTIONAL
+// Without OMPT optional support the reduction annotations compile to nothing.
+#define OMPT_REDUCTION_DECL(this_thr, gtid)
+#define OMPT_REDUCTION_BEGIN
+#define OMPT_REDUCTION_END
+#endif // ! OMPT_SUPPORT && OMPT_OPTIONAL
#endif
printf("%" PRIu64 ": ompt_event_taskgroup_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra);
break;
case ompt_sync_region_reduction:
+ printf("ompt_sync_region_reduction should never be passed to "
+ "on_ompt_callback_sync_region\n");
+ exit(-1);
break;
}
break;
printf("%" PRIu64 ": ompt_event_taskgroup_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra);
break;
case ompt_sync_region_reduction:
+ printf("ompt_sync_region_reduction should never be passed to "
+ "on_ompt_callback_sync_region\n");
+ exit(-1);
break;
}
break;
printf("%" PRIu64 ": ompt_event_wait_taskgroup_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra);
break;
case ompt_sync_region_reduction:
+ printf("ompt_sync_region_reduction should never be passed to "
+ "on_ompt_callback_sync_region_wait\n");
+ exit(-1);
break;
}
break;
printf("%" PRIu64 ": ompt_event_wait_taskgroup_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra);
break;
case ompt_sync_region_reduction:
+ printf("ompt_sync_region_reduction should never be passed to "
+ "on_ompt_callback_sync_region_wait\n");
+ exit(-1);
break;
}
break;
}
}
+// Tool-side OMPT callback for reduction begin/end events. Prints one trace
+// line per event; the exact output format is matched by FileCheck in the
+// OMPT lit tests, so the strings must not change. parallel_data may be
+// NULL, hence the guarded dereference. `kind` is ignored here.
+static void on_ompt_callback_reduction(ompt_sync_region_t kind,
+                                       ompt_scope_endpoint_t endpoint,
+                                       ompt_data_t *parallel_data,
+                                       ompt_data_t *task_data,
+                                       const void *codeptr_ra) {
+  switch (endpoint) {
+  case ompt_scope_begin:
+    printf("%" PRIu64 ": ompt_event_reduction_begin: parallel_id=%" PRIu64
+           ", task_id=%" PRIu64 ", codeptr_ra=%p\n",
+           ompt_get_thread_data()->value,
+           (parallel_data) ? parallel_data->value : 0, task_data->value,
+           codeptr_ra);
+    break;
+  case ompt_scope_end:
+    printf("%" PRIu64 ": ompt_event_reduction_end: parallel_id=%" PRIu64
+           ", task_id=%" PRIu64 ", codeptr_ra=%p\n",
+           ompt_get_thread_data()->value,
+           (parallel_data) ? parallel_data->value : 0, task_data->value,
+           codeptr_ra);
+    break;
+  }
+}
+
static void
on_ompt_callback_flush(
ompt_data_t *thread_data,
register_callback(ompt_callback_nest_lock);
register_callback(ompt_callback_sync_region);
register_callback_t(ompt_callback_sync_region_wait, ompt_callback_sync_region_t);
+ register_callback_t(ompt_callback_reduction, ompt_callback_sync_region_t);
register_callback(ompt_callback_control_tool);
register_callback(ompt_callback_flush);
register_callback(ompt_callback_cancel);
--- /dev/null
+// RUN: %libomp-compile-and-run | FileCheck %s
+// RUN: %libomp-compile -DNOWAIT && %libomp-run | FileCheck %s
+// REQUIRES: ompt
+// UNSUPPORTED: gcc
+#include "callback.h"
+#include <omp.h>
+
+#ifdef NOWAIT
+#define FOR_CLAUSE nowait
+#else
+#define FOR_CLAUSE
+#endif
+
+// Reduce 0..9999 on a single thread; exactly one reduction begin/end event
+// pair is expected, with or without the nowait clause (second RUN line).
+int main() {
+  int sum = 0;
+  int i;
+#pragma omp parallel num_threads(1)
+#pragma omp for reduction(+ : sum) FOR_CLAUSE
+  for (i = 0; i < 10000; i++) {
+    sum += i;
+  }
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID:[0-9]+]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_reduction_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]],
+  // CHECK-SAME: codeptr_ra=
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_reduction_end:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]],
+  // CHECK-SAME: task_id=[[TASK_ID]], codeptr_ra=
+
+  return 0;
+}
--- /dev/null
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+// UNSUPPORTED: gcc
+#include "callback.h"
+#include <omp.h>
+
+#ifdef NOWAIT
+#define FOR_CLAUSE nowait
+#else
+#define FOR_CLAUSE
+#endif
+
+// Reduce 0..9999 across 5 threads; four reduction begin/end event pairs are
+// expected, in nondeterministic order and thread assignment (output is piped
+// through %sort-threads before matching).
+int main() {
+  int sum = 0;
+  int i;
+#pragma omp parallel num_threads(5)
+#pragma omp for reduction(+ : sum) FOR_CLAUSE
+  for (i = 0; i < 10000; i++) {
+    sum += i;
+  }
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID:[0-9]+]]
+
+  // order and distribution to threads not determined
+  // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+  // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_end:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+  // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+  // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_end:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+  // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+  // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_end:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+  // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+  // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_end:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+
+  return 0;
+}