[libomptarget] Revert all improvements to support

author Jon Chesterfield <jonathanchesterfield@gmail.com>

Wed, 6 Nov 2019 15:43:46 +0000 (15:43 +0000)

committer Jon Chesterfield <jonathanchesterfield@gmail.com>

Wed, 6 Nov 2019 15:44:10 +0000 (15:44 +0000)
author Jon Chesterfield <jonathanchesterfield@gmail.com>
Wed, 6 Nov 2019 15:43:46 +0000 (15:43 +0000)
committer Jon Chesterfield <jonathanchesterfield@gmail.com>
Wed, 6 Nov 2019 15:44:10 +0000 (15:44 +0000)
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt

index 6e7f76f..1cd13c5 100644 (file)
--- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -55,7 +55,6 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)
        src/omptarget-nvptx.cu
        src/parallel.cu
        src/reduction.cu
-      src/support.cu
        src/sync.cu
        src/task.cu
    )
@@ -89,7 +88,7 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)
    set(BUILD_SHARED_LIBS OFF)
    set(CUDA_SEPARABLE_COMPILATION ON)
    list(APPEND CUDA_NVCC_FLAGS -I${devicertl_base_directory})
-  cuda_add_library(omptarget-nvptx STATIC unity.cu
+  cuda_add_library(omptarget-nvptx STATIC ${cuda_src_files} ${omp_data_objects}
        OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG})
  
    # Install device RTL under the lib destination folder.
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu

index f2892ac..78b04ec 100644 (file)
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -13,6 +13,11 @@
  #include "target_impl.h"
  #include <stdio.h>
  
+// Warp ID in the CUDA block
+INLINE static unsigned getWarpId() { return threadIdx.x / WARPSIZE; }
+// Lane ID in the CUDA warp.
+INLINE static unsigned getLaneId() { return threadIdx.x % WARPSIZE; }
+
  // Return true if this is the first active thread in the warp.
  INLINE static bool IsWarpMasterActiveThread() {
    unsigned long long Mask = __kmpc_impl_activemask();
@@ -62,7 +67,7 @@ __kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *rootS,
    DSPRINT0(DSFLAG_INIT,
             "Entering __kmpc_initialize_data_sharing_environment\n");
  
-  unsigned WID = GetWarpId();
+  unsigned WID = getWarpId();
    DSPRINT(DSFLAG_INIT, "Warp ID: %u\n", WID);
  
    omptarget_nvptx_TeamDescr *teamDescr =
@@ -106,7 +111,7 @@ EXTERN void *__kmpc_data_sharing_environment_begin(
    DSPRINT(DSFLAG, "Default Data Size %016llx\n",
            (unsigned long long)SharingDefaultDataSize);
  
-  unsigned WID = GetWarpId();
+  unsigned WID = getWarpId();
    __kmpc_impl_lanemask_t CurActiveThreads = __kmpc_impl_activemask();
  
    __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
@@ -226,7 +231,7 @@ EXTERN void __kmpc_data_sharing_environment_end(
  
    DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_end\n");
  
-  unsigned WID = GetWarpId();
+  unsigned WID = getWarpId();
  
    if (IsEntryPoint) {
      if (IsWarpMasterActiveThread()) {
@@ -354,7 +359,7 @@ EXTERN void __kmpc_data_sharing_init_stack_spmd() {
    // This function initializes the stack pointer with the pointer to the
    // statically allocated shared memory slots. The size of a shared memory
    // slot is pre-determined to be 256 bytes.
-  if (GetThreadIdInBlock() == 0)
+  if (threadIdx.x == 0)
      data_sharing_init_stack_common();
  
    __threadfence_block();
@@ -372,7 +377,7 @@ INLINE static void* data_sharing_push_stack_common(size_t PushSize) {
    PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment;
  
    // Frame pointer must be visible to all workers in the same warp.
-  const unsigned WID = GetWarpId();
+  const unsigned WID = getWarpId();
    void *FrameP = 0;
    __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
  
@@ -462,7 +467,7 @@ EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize,
    // Compute the start address of the frame of each thread in the warp.
    uintptr_t FrameStartAddress =
        (uintptr_t) data_sharing_push_stack_common(PushSize);
-  FrameStartAddress += (uintptr_t) (GetLaneId() * DataSize);
+  FrameStartAddress += (uintptr_t) (getLaneId() * DataSize);
    return (void *)FrameStartAddress;
  }
  
@@ -477,7 +482,7 @@ EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
    __threadfence_block();
  
    if (GetThreadIdInBlock() % WARPSIZE == 0) {
-    unsigned WID = GetWarpId();
+    unsigned WID = getWarpId();
  
      // Current slot
      __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h b/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h

index 3388b04..1052392 100644 (file)
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h
@@ -128,12 +128,12 @@
  
  #if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING
  #include <stdio.h>
-#include "support.h"
+#include "target_impl.h"
  
  template <typename... Arguments>
  NOINLINE static void log(const char *fmt, Arguments... parameters) {
-  printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(),
-         (int)GetWarpId(), (int)GetLaneId(), parameters...);
+  printf(fmt, (int)blockIdx.x, (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE),
+         (int)(threadIdx.x & 0x1F), parameters...);
  }
  
  #endif
@@ -144,8 +144,9 @@ template <typename... Arguments>
  NOINLINE static void check(bool cond, const char *fmt,
                             Arguments... parameters) {
    if (!cond)
-    printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(),
-           (int)GetWarpId(), (int)GetLaneId(), parameters...);
+    printf(fmt, (int)blockIdx.x, (int)threadIdx.x,
+           (int)(threadIdx.x / WARPSIZE), (int)(threadIdx.x & 0x1F),
+           parameters...);
    assert(cond);
  }
  
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu

index e86b1d1..20a22f4 100644 (file)
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu
@@ -364,7 +364,7 @@ EXTERN void omp_set_lock(omp_lock_t *lock) {
      for (;;) {
        now = clock();
        clock_t cycles = now > start ? now - start : now + (0xffffffff - start);
-      if (cycles >= __OMP_SPIN * GetBlockIdInKernel()) {
+      if (cycles >= __OMP_SPIN * blockIdx.x) {
          break;
        }
      }
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h

index 6c98f01..9861504 100644 (file)
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -385,5 +385,6 @@ INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId);
  ////////////////////////////////////////////////////////////////////////////////
  
  #include "omptarget-nvptxi.h"
+#include "supporti.h"
  
  #endif
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/support.h b/openmp/libomptarget/deviceRTLs/nvptx/src/support.h

index 1adabaf..de685b8 100644 (file)
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/support.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/support.h
@@ -10,12 +10,7 @@
  //
  //===----------------------------------------------------------------------===//
  
-#ifndef OMPTARGET_SUPPORT_H
-#define OMPTARGET_SUPPORT_H
-
-#include "interface.h"
  #include "target_impl.h"
-
  ////////////////////////////////////////////////////////////////////////////////
  // Execution Parameters
  ////////////////////////////////////////////////////////////////////////////////
@@ -31,70 +26,58 @@ enum RuntimeMode {
    RuntimeMask = 0x02u,
  };
  
-DEVICE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode);
-DEVICE bool isGenericMode();
-DEVICE bool isSPMDMode();
-DEVICE bool isRuntimeUninitialized();
-DEVICE bool isRuntimeInitialized();
-
-////////////////////////////////////////////////////////////////////////////////
-// Execution Modes based on location parameter fields
-////////////////////////////////////////////////////////////////////////////////
-
-DEVICE bool checkSPMDMode(kmp_Ident *loc);
-
-DEVICE bool checkGenericMode(kmp_Ident *loc);
-
-DEVICE bool checkRuntimeUninitialized(kmp_Ident *loc);
-
-DEVICE bool checkRuntimeInitialized(kmp_Ident *loc);
+INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode);
+INLINE bool isGenericMode();
+INLINE bool isSPMDMode();
+INLINE bool isRuntimeUninitialized();
+INLINE bool isRuntimeInitialized();
  
  ////////////////////////////////////////////////////////////////////////////////
  // get info from machine
  ////////////////////////////////////////////////////////////////////////////////
  
  // get low level ids of resources
-DEVICE int GetThreadIdInBlock();
-DEVICE int GetBlockIdInKernel();
-DEVICE int GetNumberOfBlocksInKernel();
-DEVICE int GetNumberOfThreadsInBlock();
-DEVICE unsigned GetWarpId();
-DEVICE unsigned GetLaneId();
+INLINE int GetThreadIdInBlock();
+INLINE int GetBlockIdInKernel();
+INLINE int GetNumberOfBlocksInKernel();
+INLINE int GetNumberOfThreadsInBlock();
+INLINE unsigned GetWarpId();
+INLINE unsigned GetLaneId();
  
  // get global ids to locate tread/team info (constant regardless of OMP)
-DEVICE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode);
-DEVICE int GetMasterThreadID();
-DEVICE int GetNumberOfWorkersInTeam();
+INLINE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode);
+INLINE int GetMasterThreadID();
+INLINE int GetNumberOfWorkersInTeam();
  
  // get OpenMP thread and team ids
-DEVICE int GetOmpThreadId(int threadId,
+INLINE int GetOmpThreadId(int threadId,
                            bool isSPMDExecutionMode);    // omp_thread_num
-DEVICE int GetOmpTeamId();                              // omp_team_num
+INLINE int GetOmpTeamId();                              // omp_team_num
  
  // get OpenMP number of threads and team
-DEVICE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads
-DEVICE int GetNumberOfOmpTeams();                           // omp_num_teams
+INLINE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads
+INLINE int GetNumberOfOmpTeams();                           // omp_num_teams
  
  // get OpenMP number of procs
-DEVICE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode);
-DEVICE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode);
+INLINE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode);
+INLINE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode);
  
  // masters
-DEVICE int IsTeamMaster(int ompThreadId);
+INLINE int IsTeamMaster(int ompThreadId);
  
  // Parallel level
-DEVICE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
-DEVICE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
+INLINE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
+INLINE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
  
  ////////////////////////////////////////////////////////////////////////////////
  // Memory
  ////////////////////////////////////////////////////////////////////////////////
  
  // safe alloc and free
-DEVICE void *SafeMalloc(size_t size, const char *msg); // check if success
-DEVICE void *SafeFree(void *ptr, const char *msg);
+INLINE void *SafeMalloc(size_t size, const char *msg); // check if success
+INLINE void *SafeFree(void *ptr, const char *msg);
  // pad to a alignment (power of 2 only)
-DEVICE unsigned long PadBytes(unsigned long size, unsigned long alignment);
+INLINE unsigned long PadBytes(unsigned long size, unsigned long alignment);
  #define ADD_BYTES(_addr, _bytes)                                               \
    ((void *)((char *)((void *)(_addr)) + (_bytes)))
  #define SUB_BYTES(_addr, _bytes)                                               \
@@ -103,8 +86,6 @@ DEVICE unsigned long PadBytes(unsigned long size, unsigned long alignment);
  ////////////////////////////////////////////////////////////////////////////////
  // Teams Reduction Scratchpad Helpers
  ////////////////////////////////////////////////////////////////////////////////
-DEVICE unsigned int *GetTeamsReductionTimestamp();
-DEVICE char *GetTeamsReductionScratchpad();
-DEVICE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr);
-
-#endif
+INLINE unsigned int *GetTeamsReductionTimestamp();
+INLINE char *GetTeamsReductionScratchpad();
+INLINE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr);
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/support.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h

similarity index 96%

rename from openmp/libomptarget/deviceRTLs/nvptx/src/support.cu

rename to openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h

index 2767597..6fa8578 100644 (file)
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/support.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -1,4 +1,4 @@
-//===--------- support.cu - NVPTX OpenMP support functions ------- CUDA -*-===//
+//===--------- supporti.h - NVPTX OpenMP support functions ------- CUDA -*-===//
  //
  // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  // See https://llvm.org/LICENSE.txt for license information.
@@ -10,14 +10,12 @@
  //
  //===----------------------------------------------------------------------===//
  
-#include "support.h"
-#include "debug.h"
-#include "omptarget-nvptx.h"
-
  ////////////////////////////////////////////////////////////////////////////////
  // Execution Parameters
  ////////////////////////////////////////////////////////////////////////////////
  
+#include "target_impl.h"
+
  INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) {
    execution_param = EMode;
    execution_param |= RMode;
@@ -106,9 +104,9 @@ INLINE int GetNumberOfBlocksInKernel() { return gridDim.x; }
  
  INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; }
  
-INLINE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
+INLINE unsigned GetWarpId() { return threadIdx.x / WARPSIZE; }
  
-INLINE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
+INLINE unsigned GetLaneId() { return threadIdx.x & (WARPSIZE - 1); }
  
  ////////////////////////////////////////////////////////////////////////////////
  //
@@ -124,9 +122,7 @@ INLINE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
  //      If NumThreads is 1024, master id is 992.
  //
  // Called in Generic Execution Mode only.
-INLINE int GetMasterThreadID() {
-  return (GetNumberOfThreadsInBlock() - 1) & ~(WARPSIZE - 1);
-}
+INLINE int GetMasterThreadID() { return (blockDim.x - 1) & ~(WARPSIZE - 1); }
  
  // The last warp is reserved for the master; other warps are workers.
  // Called in Generic Execution Mode only.
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h

index e0bacab..95fe2ad 100644 (file)
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -15,9 +15,8 @@
  #include <cuda.h>
  #include "nvptx_interface.h"
  
-#define DEVICE __device__
-#define INLINE __forceinline__ DEVICE
-#define NOINLINE __noinline__ DEVICE
+#define INLINE __forceinline__ __device__
+#define NOINLINE __noinline__ __device__
  
  ////////////////////////////////////////////////////////////////////////////////
  // Kernel options
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/unity.cu b/openmp/libomptarget/deviceRTLs/nvptx/unity.cu

deleted file mode 100644 (file)

index f5f92f3..0000000
--- a/openmp/libomptarget/deviceRTLs/nvptx/unity.cu
+++ /dev/null
@@ -1,26 +0,0 @@
-//===------ unity.cu - Unity build of NVPTX deviceRTL ------------ CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Support compilers, specifically NVCC, which have not implemented link time
-// optimisation. This removes the runtime cost of moving inline functions into
-// source files in exchange for preventing efficient incremental builds.
-//
-//===----------------------------------------------------------------------===//
-
-#include "src/cancel.cu"
-#include "src/critical.cu"
-#include "src/data_sharing.cu"
-#include "src/libcall.cu"
-#include "src/loop.cu"
-#include "src/omp_data.cu"
-#include "src/omptarget-nvptx.cu"
-#include "src/parallel.cu"
-#include "src/reduction.cu"
-#include "src/support.cu"
-#include "src/sync.cu"
-#include "src/task.cu"
author	Jon Chesterfield <jonathanchesterfield@gmail.com>
	Wed, 6 Nov 2019 15:43:46 +0000 (15:43 +0000)
committer	Jon Chesterfield <jonathanchesterfield@gmail.com>
	Wed, 6 Nov 2019 15:44:10 +0000 (15:44 +0000)
openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt		patch | blob | history
openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu		patch | blob | history
openmp/libomptarget/deviceRTLs/nvptx/src/debug.h		patch | blob | history
openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu		patch | blob | history
openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h		patch | blob | history
openmp/libomptarget/deviceRTLs/nvptx/src/support.h		patch | blob | history
openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h	[moved from openmp/libomptarget/deviceRTLs/nvptx/src/support.cu with 96% similarity]	patch | blob | history
openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h		patch | blob | history
openmp/libomptarget/deviceRTLs/nvptx/unity.cu	[deleted file]	patch | blob | history