#include "kmp_itt.h"
#include "kmp_os.h"
#include "kmp_stats.h"
-#if OMPT_SUPPORT
#include "ompt-specific.h"
-#endif
#if KMP_MIC
#include <immintrin.h>
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
team->t.t_id, i));
ANNOTATE_REDUCE_AFTER(reduce);
+ OMPT_REDUCTION_DECL(this_thr, gtid);
+ OMPT_REDUCTION_BEGIN;
(*reduce)(this_thr->th.th_local.reduce_data,
other_threads[i]->th.th_local.reduce_data);
+ OMPT_REDUCTION_END;
ANNOTATE_REDUCE_BEFORE(reduce);
ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
}
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
team->t.t_id, child_tid));
ANNOTATE_REDUCE_AFTER(reduce);
+ OMPT_REDUCTION_DECL(this_thr, gtid);
+ OMPT_REDUCTION_BEGIN;
(*reduce)(this_thr->th.th_local.reduce_data,
child_thr->th.th_local.reduce_data);
+ OMPT_REDUCTION_END;
ANNOTATE_REDUCE_BEFORE(reduce);
ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
}
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
team->t.t_id, child_tid));
ANNOTATE_REDUCE_AFTER(reduce);
+ OMPT_REDUCTION_DECL(this_thr, gtid);
+ OMPT_REDUCTION_BEGIN;
(*reduce)(this_thr->th.th_local.reduce_data,
child_thr->th.th_local.reduce_data);
+ OMPT_REDUCTION_END;
ANNOTATE_REDUCE_BEFORE(reduce);
ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
}
flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
if (reduce) {
ANNOTATE_REDUCE_AFTER(reduce);
+ OMPT_REDUCTION_DECL(this_thr, gtid);
+ OMPT_REDUCTION_BEGIN;
for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
++child_tid) {
KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
(*reduce)(this_thr->th.th_local.reduce_data,
other_threads[child_tid]->th.th_local.reduce_data);
}
+ OMPT_REDUCTION_END;
ANNOTATE_REDUCE_BEFORE(reduce);
ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
}
#include "kmp_itt.h"
#include "kmp_lock.h"
#include "kmp_stats.h"
-
-#if OMPT_SUPPORT
#include "ompt-specific.h"
-#endif
#define MAX_MESSAGE 512
loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
__KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
+ OMPT_REDUCTION_DECL(th, global_tid);
if (packed_reduction_method == critical_reduce_block) {
+ OMPT_REDUCTION_BEGIN;
+
__kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
retval = 1;
} else if (packed_reduction_method == empty_reduce_block) {
+ OMPT_REDUCTION_BEGIN;
+
// usage: if team size == 1, no synchronization is required ( Intel
// platforms only )
retval = 1;
packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
+ OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
+
if (packed_reduction_method == critical_reduce_block) {
__kmp_end_critical_section_reduce_block(loc, global_tid, lck);
+ OMPT_REDUCTION_END;
} else if (packed_reduction_method == empty_reduce_block) {
// usage: if team size == 1, no synchronization is required ( on Intel
// platforms only )
+ OMPT_REDUCTION_END;
+
} else if (packed_reduction_method == atomic_reduce_block) {
// neither master nor other workers should get here
tree_reduce_block)) {
// only master gets here
+ // OMPT: tree reduction is annotated in the barrier code
} else {
loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
__KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
+ OMPT_REDUCTION_DECL(th, global_tid);
+
if (packed_reduction_method == critical_reduce_block) {
+ OMPT_REDUCTION_BEGIN;
__kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
retval = 1;
} else if (packed_reduction_method == empty_reduce_block) {
+ OMPT_REDUCTION_BEGIN;
// usage: if team size == 1, no synchronization is required ( Intel
// platforms only )
retval = 1;
// this barrier should be visible to a customer and to the threading profile
// tool (it's a terminating barrier on constructs if NOWAIT not specified)
+ OMPT_REDUCTION_DECL(th, global_tid);
if (packed_reduction_method == critical_reduce_block) {
__kmp_end_critical_section_reduce_block(loc, global_tid, lck);
+ OMPT_REDUCTION_END;
+
// TODO: implicit barrier: should be exposed
#if OMPT_SUPPORT
ompt_frame_t *ompt_frame;
} else if (packed_reduction_method == empty_reduce_block) {
+ OMPT_REDUCTION_END;
+
// usage: if team size==1, no synchronization is required (Intel platforms only)
// TODO: implicit barrier: should be exposed
#define ompt_callback_cancel_implemented ompt_event_MAY_ALWAYS_OPTIONAL
-#define ompt_callback_reduction_implemented ompt_event_UNIMPLEMENTED
+#define ompt_callback_reduction_implemented ompt_event_MAY_ALWAYS_OPTIONAL
#define ompt_callback_dispatch_implemented ompt_event_UNIMPLEMENTED
#include "kmp.h"
+#if OMPT_SUPPORT
/*****************************************************************************
* forward declarations
****************************************************************************/
inline const char *ompt_get_runtime_version() {
return &__kmp_version_lib_ver[KMP_VERSION_MAGIC_LEN];
}
+#endif // OMPT_SUPPORT
+
+// macros providing the OMPT callbacks for reduction clause
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+// Capture the current task data, parallel (team) data, and the stored
+// return address as locals. Must appear before OMPT_REDUCTION_BEGIN/END
+// in the same scope, since those macros reference the locals declared here.
+#define OMPT_REDUCTION_DECL(this_thr, gtid) \
+ ompt_data_t *my_task_data = OMPT_CUR_TASK_DATA(this_thr); \
+ ompt_data_t *my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr); \
+ void *return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
+// Dispatch the reduction callback with ompt_scope_begin, if a tool
+// registered one. Requires a preceding OMPT_REDUCTION_DECL in scope.
+#define OMPT_REDUCTION_BEGIN \
+ if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) { \
+ ompt_callbacks.ompt_callback(ompt_callback_reduction)( \
+ ompt_sync_region_reduction, ompt_scope_begin, my_parallel_data, \
+ my_task_data, return_address); \
+ }
+// Dispatch the reduction callback with ompt_scope_end, if a tool
+// registered one. Requires a preceding OMPT_REDUCTION_DECL in scope.
+#define OMPT_REDUCTION_END \
+ if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) { \
+ ompt_callbacks.ompt_callback(ompt_callback_reduction)( \
+ ompt_sync_region_reduction, ompt_scope_end, my_parallel_data, \
+ my_task_data, return_address); \
+ }
+#else // OMPT_SUPPORT && OMPT_OPTIONAL
+// Without OMPT optional support the reduction annotations compile to nothing.
+#define OMPT_REDUCTION_DECL(this_thr, gtid)
+#define OMPT_REDUCTION_BEGIN
+#define OMPT_REDUCTION_END
+#endif // ! OMPT_SUPPORT && OMPT_OPTIONAL
#endif
printf("%" PRIu64 ": ompt_event_taskgroup_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra);
break;
case ompt_sync_region_reduction:
+ printf("ompt_sync_region_reduction should never be passed to "
+ "on_ompt_callback_sync_region\n");
+ exit(-1);
break;
}
break;
printf("%" PRIu64 ": ompt_event_taskgroup_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra);
break;
case ompt_sync_region_reduction:
+ printf("ompt_sync_region_reduction should never be passed to "
+ "on_ompt_callback_sync_region\n");
+ exit(-1);
break;
}
break;
printf("%" PRIu64 ": ompt_event_wait_taskgroup_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra);
break;
case ompt_sync_region_reduction:
+ printf("ompt_sync_region_reduction should never be passed to "
+ "on_ompt_callback_sync_region_wait\n");
+ exit(-1);
break;
}
break;
printf("%" PRIu64 ": ompt_event_wait_taskgroup_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra);
break;
case ompt_sync_region_reduction:
+ printf("ompt_sync_region_reduction should never be passed to "
+ "on_ompt_callback_sync_region_wait\n");
+ exit(-1);
break;
}
break;
}
}
+// Tool-side OMPT callback for reduction begin/end events. Prints one trace
+// line per event; the exact output format is matched by FileCheck in the
+// OMPT lit tests, so the strings must not change. parallel_data may be
+// NULL, hence the guarded dereference. `kind` is ignored here.
+static void on_ompt_callback_reduction(ompt_sync_region_t kind,
+                                       ompt_scope_endpoint_t endpoint,
+                                       ompt_data_t *parallel_data,
+                                       ompt_data_t *task_data,
+                                       const void *codeptr_ra) {
+  switch (endpoint) {
+  case ompt_scope_begin:
+    printf("%" PRIu64 ": ompt_event_reduction_begin: parallel_id=%" PRIu64
+           ", task_id=%" PRIu64 ", codeptr_ra=%p\n",
+           ompt_get_thread_data()->value,
+           (parallel_data) ? parallel_data->value : 0, task_data->value,
+           codeptr_ra);
+    break;
+  case ompt_scope_end:
+    printf("%" PRIu64 ": ompt_event_reduction_end: parallel_id=%" PRIu64
+           ", task_id=%" PRIu64 ", codeptr_ra=%p\n",
+           ompt_get_thread_data()->value,
+           (parallel_data) ? parallel_data->value : 0, task_data->value,
+           codeptr_ra);
+    break;
+  }
+}
+
static void
on_ompt_callback_flush(
ompt_data_t *thread_data,
register_callback(ompt_callback_nest_lock);
register_callback(ompt_callback_sync_region);
register_callback_t(ompt_callback_sync_region_wait, ompt_callback_sync_region_t);
+ register_callback_t(ompt_callback_reduction, ompt_callback_sync_region_t);
register_callback(ompt_callback_control_tool);
register_callback(ompt_callback_flush);
register_callback(ompt_callback_cancel);
--- /dev/null
+// RUN: %libomp-compile-and-run | FileCheck %s
+// RUN: %libomp-compile -DNOWAIT && %libomp-run | FileCheck %s
+// REQUIRES: ompt
+// UNSUPPORTED: gcc
+#include "callback.h"
+#include <omp.h>
+
+#ifdef NOWAIT
+#define FOR_CLAUSE nowait
+#else
+#define FOR_CLAUSE
+#endif
+
+// Reduce 0..9999 on a single thread; exactly one reduction begin/end event
+// pair is expected, with or without the nowait clause (second RUN line).
+int main() {
+  int sum = 0;
+  int i;
+#pragma omp parallel num_threads(1)
+#pragma omp for reduction(+ : sum) FOR_CLAUSE
+  for (i = 0; i < 10000; i++) {
+    sum += i;
+  }
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID:[0-9]+]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_reduction_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]],
+  // CHECK-SAME: codeptr_ra=
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_reduction_end:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]],
+  // CHECK-SAME: task_id=[[TASK_ID]], codeptr_ra=
+
+  return 0;
+}
--- /dev/null
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+// UNSUPPORTED: gcc
+#include "callback.h"
+#include <omp.h>
+
+#ifdef NOWAIT
+#define FOR_CLAUSE nowait
+#else
+#define FOR_CLAUSE
+#endif
+
+// Reduce 0..9999 across 5 threads; four reduction begin/end event pairs are
+// expected, in nondeterministic order and thread assignment (output is piped
+// through %sort-threads before matching).
+int main() {
+  int sum = 0;
+  int i;
+#pragma omp parallel num_threads(5)
+#pragma omp for reduction(+ : sum) FOR_CLAUSE
+  for (i = 0; i < 10000; i++) {
+    sum += i;
+  }
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID:[0-9]+]]
+
+  // order and distribution to threads not determined
+  // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+  // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_end:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+  // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+  // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_end:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+  // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+  // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_end:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+  // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+  // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_end:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+
+  return 0;
+}