static inline void __kmp_track_dependence(kmp_int32 gtid, kmp_depnode_t *source,
kmp_depnode_t *sink,
kmp_task_t *sink_task) {
+#if OMPX_TASKGRAPH
+ kmp_taskdata_t *task_source = KMP_TASK_TO_TASKDATA(source->dn.task);
+ kmp_taskdata_t *task_sink = KMP_TASK_TO_TASKDATA(sink_task);
+ if (source->dn.task && sink_task) {
+ // Dependences between a task that is inside the TDG and a task that is
+ // outside of it are not supported
+ KMP_ASSERT(task_source->is_taskgraph == task_sink->is_taskgraph);
+ }
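+ // while recording, store the edge source -> sink in the TDG's record_map:
+ // append the sink to the source node's successors list and bump the sink
+ // node's predecessor count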
+ if (task_sink->is_taskgraph &&
+ __kmp_tdg_is_recording(task_sink->tdg->tdg_status)) {
+ kmp_node_info_t *source_info =
+ &task_sink->tdg->record_map[task_source->td_task_id];
+ bool exists = false;
+ for (int i = 0; i < source_info->nsuccessors; i++) {
+ if (source_info->successors[i] == task_sink->td_task_id) {
+ exists = true;
+ break;
+ }
+ }
+ if (!exists) {
+ if (source_info->nsuccessors >= source_info->successors_size) {
+ kmp_uint old_size = source_info->successors_size;
+ source_info->successors_size = 2 * source_info->successors_size;
+ kmp_int32 *old_succ_ids = source_info->successors;
+ kmp_int32 *new_succ_ids = (kmp_int32 *)__kmp_allocate(
+ source_info->successors_size * sizeof(kmp_int32));
+ // keep the successors recorded so far when growing the list
+ KMP_MEMCPY(new_succ_ids, old_succ_ids, old_size * sizeof(kmp_int32));
+ source_info->successors = new_succ_ids;
+ __kmp_free(old_succ_ids);
+ }
+
+ source_info->successors[source_info->nsuccessors] = task_sink->td_task_id;
+ source_info->nsuccessors++;
+
+ kmp_node_info_t *sink_info =
+ &(task_sink->tdg->record_map[task_sink->td_task_id]);
+ sink_info->npredecessors++;
+ }
+ }
+#endif
#ifdef KMP_SUPPORT_GRAPH_OUTPUT
kmp_taskdata_t *task_source = KMP_TASK_TO_TASKDATA(source->dn.task);
// do not use sink->dn.task as that is only filled after the dependences
// link node as successor of list elements
for (kmp_depnode_list_t *p = plist; p; p = p->next) {
kmp_depnode_t *dep = p->node;
+#if OMPX_TASKGRAPH
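+ // while recording, the dependence is registered in the TDG here; the call
+ // to __kmp_track_dependence under the depnode lock below is skipped in
+ // that case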
+ kmp_tdg_status tdg_status = KMP_TDG_NONE;
+ if (task) {
+ kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
+ if (td->is_taskgraph)
+ tdg_status = td->tdg->tdg_status;
+ if (__kmp_tdg_is_recording(tdg_status))
+ __kmp_track_dependence(gtid, dep, node, task);
+ }
+#endif
if (dep->dn.task) {
KMP_ACQUIRE_DEPNODE(gtid, dep);
if (dep->dn.task) {
- __kmp_track_dependence(gtid, dep, node, task);
+#if OMPX_TASKGRAPH
+ if (!(__kmp_tdg_is_recording(tdg_status)) && task)
+#endif
+ __kmp_track_dependence(gtid, dep, node, task);
dep->dn.successors = __kmp_add_node(thread, dep->dn.successors, node);
KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to "
"%p\n",
if (!sink)
return 0;
kmp_int32 npredecessors = 0;
+#if OMPX_TASKGRAPH
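+ // as above: while recording, the dependence is registered in the TDG here
+ // and the call under the depnode lock below is skipped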
+ kmp_tdg_status tdg_status = KMP_TDG_NONE;
+ kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
+ if (task) {
+ if (td->is_taskgraph)
+ tdg_status = td->tdg->tdg_status;
+ if (__kmp_tdg_is_recording(tdg_status) && sink->dn.task)
+ __kmp_track_dependence(gtid, sink, source, task);
+ }
+#endif
if (sink->dn.task) {
// synchronously add source to sink' list of successors
KMP_ACQUIRE_DEPNODE(gtid, sink);
if (sink->dn.task) {
- __kmp_track_dependence(gtid, sink, source, task);
+#if OMPX_TASKGRAPH
+ if (!(__kmp_tdg_is_recording(tdg_status)) && task)
+#endif
+ __kmp_track_dependence(gtid, sink, source, task);
sink->dn.successors = __kmp_add_node(thread, sink->dn.successors, source);
KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to "
"%p\n",
gtid, KMP_TASK_TO_TASKDATA(sink->dn.task),
KMP_TASK_TO_TASKDATA(task)));
+#if OMPX_TASKGRAPH
+ if (__kmp_tdg_is_recording(tdg_status)) {
+ kmp_taskdata_t *tdd = KMP_TASK_TO_TASKDATA(sink->dn.task);
+ if (tdd->is_taskgraph) {
+ if (tdd->td_flags.onced)
+ // decrement npredecessors if sink->dn.task belongs to a taskgraph and
+ // 1) the task has been reset to its initial state (by __kmp_free_task) or
+ // 2) the task is complete but has not been reset yet
+ npredecessors--;
+ }
+ }
+#endif
npredecessors++;
}
KMP_RELEASE_DEPNODE(gtid, sink);
kmp_info_t *thread = __kmp_threads[gtid];
kmp_taskdata_t *current_task = thread->th.th_current_task;
+#if OMPX_TASKGRAPH
+ // record the task in the TDG while recording (task-with-dependences path)
+ if (new_taskdata->is_taskgraph &&
+ __kmp_tdg_is_recording(new_taskdata->tdg->tdg_status)) {
+ kmp_tdg_info_t *tdg = new_taskdata->tdg;
+ // extend record_map if needed
+ if (new_taskdata->td_task_id >= tdg->map_size) {
+ __kmp_acquire_bootstrap_lock(&tdg->graph_lock);
+ if (new_taskdata->td_task_id >= tdg->map_size) {
+ kmp_uint old_size = tdg->map_size;
+ kmp_uint new_size = old_size * 2;
+ kmp_node_info_t *old_record = tdg->record_map;
+ kmp_node_info_t *new_record = (kmp_node_info_t *)__kmp_allocate(
+ new_size * sizeof(kmp_node_info_t));
+ KMP_MEMCPY(new_record, tdg->record_map,
+ old_size * sizeof(kmp_node_info_t));
+ tdg->record_map = new_record;
+
+ __kmp_free(old_record);
+
+ for (kmp_int i = old_size; i < new_size; i++) {
+ kmp_int32 *successorsList = (kmp_int32 *)__kmp_allocate(
+ __kmp_successors_size * sizeof(kmp_int32));
+ new_record[i].task = nullptr;
+ new_record[i].successors = successorsList;
+ new_record[i].nsuccessors = 0;
+ new_record[i].npredecessors = 0;
+ new_record[i].successors_size = __kmp_successors_size;
+ KMP_ATOMIC_ST_REL(&new_record[i].npredecessors_counter, 0);
+ }
+ // update map_size at the end, so that another thread never sees the new
+ // size while record_map still points to old_record
+ tdg->map_size = new_size;
+ }
+ __kmp_release_bootstrap_lock(&tdg->graph_lock);
+ }
+ tdg->record_map[new_taskdata->td_task_id].task = new_task;
+ tdg->record_map[new_taskdata->td_task_id].parent_task =
+ new_taskdata->td_parent;
+ KMP_ATOMIC_INC(&tdg->num_tasks);
+ }
+#endif
#if OMPT_SUPPORT
if (ompt_enabled.enabled) {
if (!current_task->ompt_task_info.frame.enter_frame.ptr)
static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
kmp_task_team_t *task_team);
static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
+#if OMPX_TASKGRAPH
+static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id);
+int __kmp_taskloop_task(int gtid, void *ptask);
+#endif
#ifdef BUILD_TIED_TASK_STACK
}
// Check mutexinoutset dependencies, acquire locks
kmp_depnode_t *node = tasknew->td_depnode;
+#if OMPX_TASKGRAPH
+ if (!tasknew->is_taskgraph &&
+ UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
+#else
if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
+#endif
for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
task->data2.priority = 0;
taskdata->td_flags.freed = 1;
+#if OMPX_TASKGRAPH
+ // do not free tasks in taskgraph
+ if (!taskdata->is_taskgraph) {
+#endif
// deallocate the taskdata and shared variable blocks associated with this task
#if USE_FAST_MEMORY
__kmp_fast_free(thread, taskdata);
#else /* ! USE_FAST_MEMORY */
__kmp_thread_free(thread, taskdata);
#endif
+#if OMPX_TASKGRAPH
+ } else {
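+ // the task belongs to a taskgraph: instead of freeing it, reset its flags
+ // and counters so the same taskdata can be executed again when the TDG is
+ // replayed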
+ taskdata->td_flags.complete = 0;
+ taskdata->td_flags.started = 0;
+ taskdata->td_flags.freed = 0;
+ taskdata->td_flags.executing = 0;
+ taskdata->td_flags.task_serial =
+ (taskdata->td_parent->td_flags.final ||
+ taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser);
+
+ // taskdata->td_allow_completion_event.pending_events_count = 1;
+ KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
+ KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
+ // start at one because counts current task and children
+ KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
+ }
+#endif
+
KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
}
flags.detachable == TASK_DETACHABLE || flags.hidden_helper;
ret = ret ||
KMP_ATOMIC_LD_ACQ(&taskdata->td_parent->td_incomplete_child_tasks) > 0;
+#if OMPX_TASKGRAPH
+ if (taskdata->td_taskgroup && taskdata->is_taskgraph)
+ ret = ret || KMP_ATOMIC_LD_ACQ(&taskdata->td_taskgroup->count) > 0;
+#endif
return ret;
}
kmp_info_t *thread = __kmp_threads[gtid];
kmp_task_team_t *task_team =
thread->th.th_task_team; // might be NULL for serial teams...
+#if OMPX_TASKGRAPH
+ // cache is_taskgraph to avoid a segfault when taskdata->td_flags needs to
+ // be read after the task has already been freed (e.g. by a vanilla taskloop)
+ bool is_taskgraph;
+#endif
#if KMP_DEBUG
kmp_int32 children = 0;
#endif
KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
+#if OMPX_TASKGRAPH
+ is_taskgraph = taskdata->is_taskgraph;
+#endif
+
// Pop task from stack if tied
#ifdef BUILD_TIED_TASK_STACK
if (taskdata->td_flags.tiedness == TASK_TIED) {
if (completed) {
taskdata->td_flags.complete = 1; // mark the task as completed
+#if OMPX_TASKGRAPH
+ taskdata->td_flags.onced = 1; // mark the task as having run once
+#endif
#if OMPT_SUPPORT
// This is not a detached task, we are done here
#endif
KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
KMP_DEBUG_ASSERT(children >= 0);
+#if OMPX_TASKGRAPH
+ if (taskdata->td_taskgroup && !taskdata->is_taskgraph)
+#else
if (taskdata->td_taskgroup)
+#endif
KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
} else if (task_team && (task_team->tt.tt_found_proxy_tasks ||
task_team->tt.tt_hidden_helper_task_encountered)) {
// KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
resumed_task->td_flags.executing = 1; // resume previous task
+#if OMPX_TASKGRAPH
+ if (is_taskgraph && __kmp_track_children_task(taskdata) &&
+ taskdata->td_taskgroup) {
+ // TDG: we only release the taskgroup barrier here because
+ // __kmp_free_task_and_ancestors will call __kmp_free_task, which resets
+ // all task parameters such as taskdata->started, etc. If we released the
+ // barrier earlier, these parameters could be read before being reset.
+ // This is not an issue for the non-TDG implementation because a
+ // task(data) structure is never reused.
+ KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
+ }
+#endif
+
KA_TRACE(
10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
gtid, taskdata, resumed_task));
task->td_flags.executing = 1;
task->td_flags.complete = 0;
task->td_flags.freed = 0;
+#if OMPX_TASKGRAPH
+ task->td_flags.onced = 0;
+#endif
task->td_depnode = NULL;
task->td_last_tied = task;
if (task->td_dephash) {
int children;
task->td_flags.complete = 1;
+#if OMPX_TASKGRAPH
+ task->td_flags.onced = 1;
+#endif
children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
kmp_tasking_flags_t flags_old = task->td_flags;
if (children == 0 && flags_old.complete == 1) {
taskdata->td_flags.executing = 0;
taskdata->td_flags.complete = 0;
taskdata->td_flags.freed = 0;
-
+#if OMPX_TASKGRAPH
+ taskdata->td_flags.onced = 0;
+#endif
KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
// start at one because counts current task and children
KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
}
}
+#if OMPX_TASKGRAPH
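+ // while a TDG is being recorded, every task allocated here (except the
+ // internal taskloop task) becomes a TDG node and gets a TDG-local id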
+ kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
+ if (tdg && __kmp_tdg_is_recording(tdg->tdg_status) &&
+ (task_entry != (kmp_routine_entry_t)__kmp_taskloop_task)) {
+ taskdata->is_taskgraph = 1;
+ taskdata->tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
+ taskdata->td_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
+ }
+#endif
KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
gtid, taskdata, taskdata->td_parent));
bool serialize_immediate) {
kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
+#if OMPX_TASKGRAPH
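+ // record the task in the TDG while recording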
+ if (new_taskdata->is_taskgraph &&
+ __kmp_tdg_is_recording(new_taskdata->tdg->tdg_status)) {
+ kmp_tdg_info_t *tdg = new_taskdata->tdg;
+ // extend the record_map if needed
+ if (new_taskdata->td_task_id >= new_taskdata->tdg->map_size) {
+ __kmp_acquire_bootstrap_lock(&tdg->graph_lock);
+ // map_size could have been updated by another thread if recursive
+ // taskloop
+ if (new_taskdata->td_task_id >= tdg->map_size) {
+ kmp_uint old_size = tdg->map_size;
+ kmp_uint new_size = old_size * 2;
+ kmp_node_info_t *old_record = tdg->record_map;
+ kmp_node_info_t *new_record = (kmp_node_info_t *)__kmp_allocate(
+ new_size * sizeof(kmp_node_info_t));
+
+ KMP_MEMCPY(new_record, old_record, old_size * sizeof(kmp_node_info_t));
+ tdg->record_map = new_record;
+
+ __kmp_free(old_record);
+
+ for (kmp_int i = old_size; i < new_size; i++) {
+ kmp_int32 *successorsList = (kmp_int32 *)__kmp_allocate(
+ __kmp_successors_size * sizeof(kmp_int32));
+ new_record[i].task = nullptr;
+ new_record[i].successors = successorsList;
+ new_record[i].nsuccessors = 0;
+ new_record[i].npredecessors = 0;
+ new_record[i].successors_size = __kmp_successors_size;
+ KMP_ATOMIC_ST_REL(&new_record[i].npredecessors_counter, 0);
+ }
+ // update map_size at the end, so that another thread never sees the new
+ // size while record_map still points to old_record
+ tdg->map_size = new_size;
+ }
+ __kmp_release_bootstrap_lock(&tdg->graph_lock);
+ }
+ // record the task if it has not been recorded yet
+ if (tdg->record_map[new_taskdata->td_task_id].task == nullptr) {
+ tdg->record_map[new_taskdata->td_task_id].task = new_task;
+ tdg->record_map[new_taskdata->td_task_id].parent_task =
+ new_taskdata->td_parent;
+ KMP_ATOMIC_INC(&tdg->num_tasks);
+ }
+ }
+#endif
+
/* Should we execute the new task or queue it? For now, let's just always try
to queue it. If the queue fills up, then we'll execute it. */
if (new_taskdata->td_flags.proxy == TASK_PROXY ||
taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
}
#endif // OMPT_SUPPORT && OMPT_OPTIONAL
-
}
KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
without help of the runtime library.
*/
void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
+#if OMPX_TASKGRAPH
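+ // while recording, keep a copy of the reduction data so that it can be
+ // re-initialized when the TDG is replayed (see __kmp_exec_tdg)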
+ kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
+ if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
+ kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
+ this_tdg->rec_taskred_data =
+ __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
+ this_tdg->rec_num_taskred = num;
+ KMP_MEMCPY(this_tdg->rec_taskred_data, data,
+ sizeof(kmp_task_red_input_t) * num);
+ }
+#endif
return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
}
has two parameters, pointer to object to be initialized and pointer to omp_orig
*/
void *__kmpc_taskred_init(int gtid, int num, void *data) {
+#if OMPX_TASKGRAPH
+ kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
+ if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
+ kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
+ this_tdg->rec_taskred_data =
+ __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
+ this_tdg->rec_num_taskred = num;
+ KMP_MEMCPY(this_tdg->rec_taskred_data, data,
+ sizeof(kmp_task_red_input_t) * num);
+ }
+#endif
return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
}
kmp_int32 num = tg->reduce_num_data;
kmp_int32 tid = thread->th.th_info.ds.ds_tid;
+#if OMPX_TASKGRAPH
+ if ((thread->th.th_current_task->is_taskgraph) &&
+ (!__kmp_tdg_is_recording(
+ __kmp_global_tdgs[__kmp_curr_tdg_idx]->tdg_status))) {
+ tg = thread->th.th_current_task->td_taskgroup;
+ KMP_ASSERT(tg != NULL);
+ KMP_ASSERT(tg->reduce_data != NULL);
+ arr = (kmp_taskred_data_t *)(tg->reduce_data);
+ num = tg->reduce_num_data;
+ }
+#endif
+
KMP_ASSERT(data != NULL);
while (tg != NULL) {
for (int i = 0; i < num; ++i) {
KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
taskdata->td_flags.complete = 1; // mark the task as completed
+#if OMPX_TASKGRAPH
+ taskdata->td_flags.onced = 1;
+#endif
if (taskdata->td_taskgroup)
KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
//
// thread: allocating thread
// task_src: pointer to source task to be duplicated
+// taskloop_recur: used only when dealing with taskgraph,
+// indicating whether this duplication is for the recursive splitting of a
+// taskloop (and hence whether task->td_task_id needs to be updated)
// returns: a pointer to the allocated kmp_task_t structure (task).
-kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
+kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src
+#if OMPX_TASKGRAPH
+ , int taskloop_recur
+#endif
+) {
kmp_task_t *task;
kmp_taskdata_t *taskdata;
kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
task = KMP_TASKDATA_TO_TASK(taskdata);
// Initialize new task (only specific fields not affected by memcpy)
+#if OMPX_TASKGRAPH
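+ // the duplicate gets a TDG node id only when it is a TDG node being
+ // recorded and is not the recursive taskloop pattern task; otherwise it
+ // gets a regular task id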
+ if (!taskdata->is_taskgraph || taskloop_recur)
+ taskdata->td_task_id = KMP_GEN_TASK_ID();
+ else if (taskdata->is_taskgraph &&
+ __kmp_tdg_is_recording(taskdata_src->tdg->tdg_status))
+ taskdata->td_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
+#else
taskdata->td_task_id = KMP_GEN_TASK_ID();
+#endif
if (task->shareds != NULL) { // need setup shareds pointer
shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
task->shareds = &((char *)taskdata)[shareds_offset];
lastpriv = 1;
}
}
+
+#if OMPX_TASKGRAPH
+ next_task = __kmp_task_dup_alloc(thread, task, /* taskloop_recur */ 0);
+#else
next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
+#endif
+
kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
kmp_taskloop_bounds_t next_task_bounds =
kmp_taskloop_bounds_t(next_task, task_bounds);
lb1 = ub0 + st;
// create pattern task for 2nd half of the loop
+#if OMPX_TASKGRAPH
+ next_task = __kmp_task_dup_alloc(thread, task,
+ /* taskloop_recur */ 1);
+#else
next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
+#endif
// adjust lower bound (upper bound is not changed) for the 2nd half
*(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
if (ptask_dup != NULL) // construct firstprivates, etc.
p->codeptr_ra = codeptr_ra;
#endif
+#if OMPX_TASKGRAPH
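+ // the internal taskloop task keeps a pointer to the TDG but is not itself
+ // recorded as a TDG node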
+ kmp_taskdata_t *new_task_data = KMP_TASK_TO_TASKDATA(new_task);
+ new_task_data->tdg = taskdata->tdg;
+ new_task_data->is_taskgraph = 0;
+#endif
+
#if OMPT_SUPPORT
// schedule new task with correct return address for OMPT events
__kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
__kmpc_taskgroup(loc, gtid);
}
+#if OMPX_TASKGRAPH
+ KMP_ATOMIC_DEC(&__kmp_tdg_task_id);
+#endif
// =========================================================================
// calculate loop parameters
kmp_taskloop_bounds_t task_bounds(task, lb, ub);
return taskdata->td_task_team != NULL;
}
+
+#if OMPX_TASKGRAPH
+// __kmp_find_tdg: identify a TDG through its ID
+// tdg_id: ID of the TDG
+// returns: A pointer to the TDG with this ID if it exists and is not in its
+// initial state, otherwise nullptr
+static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id) {
+ kmp_tdg_info_t *res = nullptr;
+ if (__kmp_max_tdgs == 0)
+ return res;
+
+ if (__kmp_global_tdgs == NULL)
+ __kmp_global_tdgs = (kmp_tdg_info_t **)__kmp_allocate(
+ sizeof(kmp_tdg_info_t *) * __kmp_max_tdgs);
+
+ if ((__kmp_global_tdgs[tdg_id]) &&
+ (__kmp_global_tdgs[tdg_id]->tdg_status != KMP_TDG_NONE))
+ res = __kmp_global_tdgs[tdg_id];
+ return res;
+}
+
+// __kmp_exec_tdg: launch the execution of a previously recorded TDG
+// gtid: Global Thread ID
+// tdg: Pointer to the TDG to execute
+void __kmp_exec_tdg(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
+ KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_READY);
+ KA_TRACE(10, ("__kmp_exec_tdg(enter): T#%d tdg_id=%d num_roots=%d\n", gtid,
+ tdg->tdg_id, tdg->num_roots));
+ kmp_node_info_t *this_record_map = tdg->record_map;
+ kmp_int32 *this_root_tasks = tdg->root_tasks;
+ kmp_int32 this_num_roots = tdg->num_roots;
+ kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
+
+ kmp_info_t *thread = __kmp_threads[gtid];
+ kmp_taskdata_t *parent_task = thread->th.th_current_task;
+
+ if (tdg->rec_taskred_data) {
+ __kmpc_taskred_init(gtid, tdg->rec_num_taskred, tdg->rec_taskred_data);
+ }
+
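+ // re-attach every recorded node to the current parent task and taskgroup,
+ // and reset its dependence counter, before launching the root tasks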
+ for (kmp_int32 j = 0; j < this_num_tasks; j++) {
+ kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(this_record_map[j].task);
+
+ td->td_parent = parent_task;
+ this_record_map[j].parent_task = parent_task;
+
+ kmp_taskgroup_t *parent_taskgroup =
+ this_record_map[j].parent_task->td_taskgroup;
+
+ KMP_ATOMIC_ST_RLX(&this_record_map[j].npredecessors_counter,
+ this_record_map[j].npredecessors);
+ KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_incomplete_child_tasks);
+
+ if (parent_taskgroup) {
+ KMP_ATOMIC_INC(&parent_taskgroup->count);
+ // The taskgroup is different so we must update it
+ td->td_taskgroup = parent_taskgroup;
+ } else if (td->td_taskgroup != nullptr) {
+ // If the parent doesn't have a taskgroup, remove it from the task
+ td->td_taskgroup = nullptr;
+ }
+ if (this_record_map[j].parent_task->td_flags.tasktype == TASK_EXPLICIT)
+ KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_allocated_child_tasks);
+ }
+
+ for (kmp_int32 j = 0; j < this_num_roots; ++j) {
+ __kmp_omp_task(gtid, this_record_map[this_root_tasks[j]].task, true);
+ }
+ KA_TRACE(10, ("__kmp_exec_tdg(exit): T#%d tdg_id=%d num_roots=%d\n", gtid,
+ tdg->tdg_id, tdg->num_roots));
+}
+
+// __kmp_start_record: set up a TDG structure and turn the
+// recording flag to true
+// gtid: Global Thread ID of the encountering thread
+// flags: Flags associated with the TDG
+// tdg_id: ID of the TDG to record
+static inline void __kmp_start_record(kmp_int32 gtid,
+ kmp_taskgraph_flags_t *flags,
+ kmp_int32 tdg_id) {
+ kmp_tdg_info_t *tdg =
+ (kmp_tdg_info_t *)__kmp_allocate(sizeof(kmp_tdg_info_t));
+ __kmp_global_tdgs[__kmp_curr_tdg_idx] = tdg;
+ // Initializing the TDG structure
+ tdg->tdg_id = tdg_id;
+ tdg->map_size = INIT_MAPSIZE;
+ tdg->num_roots = -1;
+ tdg->root_tasks = nullptr;
+ tdg->tdg_status = KMP_TDG_RECORDING;
+ tdg->rec_num_taskred = 0;
+ tdg->rec_taskred_data = nullptr;
+ KMP_ATOMIC_ST_RLX(&tdg->num_tasks, 0);
+
+ // Initializing the list of nodes in this TDG
+ kmp_node_info_t *this_record_map =
+ (kmp_node_info_t *)__kmp_allocate(INIT_MAPSIZE * sizeof(kmp_node_info_t));
+ for (kmp_int32 i = 0; i < INIT_MAPSIZE; i++) {
+ kmp_int32 *successorsList =
+ (kmp_int32 *)__kmp_allocate(__kmp_successors_size * sizeof(kmp_int32));
+ this_record_map[i].task = nullptr;
+ this_record_map[i].successors = successorsList;
+ this_record_map[i].nsuccessors = 0;
+ this_record_map[i].npredecessors = 0;
+ this_record_map[i].successors_size = __kmp_successors_size;
+ KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter, 0);
+ }
+
+ __kmp_global_tdgs[__kmp_curr_tdg_idx]->record_map = this_record_map;
+}
+
+// __kmpc_start_record_task: Wrapper around __kmp_start_record to mark
+// the beginning of the record process of a task region
+// loc_ref: Location of TDG, not used yet
+// gtid: Global Thread ID of the encountering thread
+// input_flags: Flags associated with the TDG
+// tdg_id: ID of the TDG to record; for now, an incremental integer
+// returns: 1 if we record, otherwise 0
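+//
+// A possible call sequence a compiler might emit for a record-and-replay
+// region (a sketch; the exact arguments and the tdg_id management are up to
+// the compiler):
+//   if (__kmpc_start_record_task(loc, gtid, /*input_flags=*/0, tdg_id)) {
+//     ... create the tasks of the region (they execute and are recorded) ...
+//   }
+//   __kmpc_end_record_task(loc, gtid, /*input_flags=*/0, tdg_id);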
+kmp_int32 __kmpc_start_record_task(ident_t *loc_ref, kmp_int32 gtid,
+ kmp_int32 input_flags, kmp_int32 tdg_id) {
+
+ kmp_int32 res;
+ kmp_taskgraph_flags_t *flags = (kmp_taskgraph_flags_t *)&input_flags;
+ KA_TRACE(10,
+ ("__kmpc_start_record_task(enter): T#%d loc=%p flags=%d tdg_id=%d\n",
+ gtid, loc_ref, input_flags, tdg_id));
+
+ if (__kmp_max_tdgs == 0) {
+ KA_TRACE(
+ 10,
+ ("__kmpc_start_record_task(abandon): T#%d loc=%p flags=%d tdg_id = %d, "
+ "__kmp_max_tdgs = 0\n",
+ gtid, loc_ref, input_flags, tdg_id));
+ return 1;
+ }
+
+ __kmpc_taskgroup(loc_ref, gtid);
+ if (kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id)) {
+ // TODO: use re_record flag
+ __kmp_exec_tdg(gtid, tdg);
+ res = 0;
+ } else {
+ __kmp_curr_tdg_idx = tdg_id;
+ KMP_DEBUG_ASSERT(__kmp_curr_tdg_idx < __kmp_max_tdgs);
+ __kmp_start_record(gtid, flags, tdg_id);
+ __kmp_num_tdg++;
+ res = 1;
+ }
+ KA_TRACE(10, ("__kmpc_start_record_task(exit): T#%d TDG %d starts to %s\n",
+ gtid, tdg_id, res ? "record" : "execute"));
+ return res;
+}
+
+// __kmp_end_record: finalize a TDG after recording it
+// gtid: Global thread ID
+// tdg: Pointer to the TDG
+void __kmp_end_record(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
+ // Store roots
+ kmp_node_info_t *this_record_map = tdg->record_map;
+ kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
+ kmp_int32 *this_root_tasks =
+ (kmp_int32 *)__kmp_allocate(this_num_tasks * sizeof(kmp_int32));
+ kmp_int32 this_map_size = tdg->map_size;
+ kmp_int32 this_num_roots = 0;
+ kmp_info_t *thread = __kmp_threads[gtid];
+
+ for (kmp_int32 i = 0; i < this_num_tasks; i++) {
+ if (this_record_map[i].npredecessors == 0) {
+ this_root_tasks[this_num_roots++] = i;
+ }
+ }
+
+ // Update with roots info and mapsize
+ tdg->map_size = this_map_size;
+ tdg->num_roots = this_num_roots;
+ tdg->root_tasks = this_root_tasks;
+ KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_RECORDING);
+ tdg->tdg_status = KMP_TDG_READY;
+
+ if (thread->th.th_current_task->td_dephash) {
+ __kmp_dephash_free(thread, thread->th.th_current_task->td_dephash);
+ thread->th.th_current_task->td_dephash = NULL;
+ }
+
+ // Reset predecessor counter
+ for (kmp_int32 i = 0; i < this_num_tasks; i++) {
+ KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter,
+ this_record_map[i].npredecessors);
+ }
+ KMP_ATOMIC_ST_RLX(&__kmp_tdg_task_id, 0);
+}
+
+// __kmpc_end_record_task: wrapper around __kmp_end_record to mark
+// the end of the recording phase
+//
+// loc_ref: Source location information
+// gtid: Global thread ID
+// input_flags: Flags attached to the graph
+// tdg_id: ID of the TDG that has just finished recording
+void __kmpc_end_record_task(ident_t *loc_ref, kmp_int32 gtid,
+ kmp_int32 input_flags, kmp_int32 tdg_id) {
+ kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id);
+
+ KA_TRACE(10, ("__kmpc_end_record_task(enter): T#%d loc=%p finishes recording"
+ " tdg=%d with flags=%d\n",
+ gtid, loc_ref, tdg_id, input_flags));
+ if (__kmp_max_tdgs) {
+ // TODO: use input_flags->nowait
+ __kmpc_end_taskgroup(loc_ref, gtid);
+ if (__kmp_tdg_is_recording(tdg->tdg_status))
+ __kmp_end_record(gtid, tdg);
+ }
+ KA_TRACE(10, ("__kmpc_end_record_task(exit): T#%d loc=%p finished recording"
+ " tdg=%d, its status is now READY\n",
+ gtid, loc_ref, tdg_id));
+}
+#endif