[libomptarget] Build DeviceRTL for amdgpu
author Jon Chesterfield <jonathanchesterfield@gmail.com>
Wed, 27 Oct 2021 23:39:37 +0000 (00:39 +0100)
committer Jon Chesterfield <jonathanchesterfield@gmail.com>
Wed, 27 Oct 2021 23:41:45 +0000 (00:41 +0100)
Passes the same tests as the current deviceRTL. Includes the cmake change from D111987.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D112227

22 files changed:
clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
openmp/libomptarget/DeviceRTL/CMakeLists.txt
openmp/libomptarget/DeviceRTL/src/Configuration.cpp
openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
openmp/libomptarget/plugins/amdgpu/CMakeLists.txt
openmp/libomptarget/test/mapping/data_member_ref.cpp
openmp/libomptarget/test/mapping/declare_mapper_nested_default_mappers.cpp
openmp/libomptarget/test/mapping/declare_mapper_nested_mappers.cpp
openmp/libomptarget/test/mapping/delete_inf_refcount.c
openmp/libomptarget/test/mapping/lambda_by_value.cpp
openmp/libomptarget/test/mapping/ompx_hold/struct.c
openmp/libomptarget/test/mapping/ptr_and_obj_motion.c
openmp/libomptarget/test/mapping/reduction_implicit_map.cpp
openmp/libomptarget/test/offloading/bug49021.cpp
openmp/libomptarget/test/offloading/bug49334.cpp
openmp/libomptarget/test/offloading/bug50022.cpp
openmp/libomptarget/test/offloading/global_constructor.cpp
openmp/libomptarget/test/offloading/host_as_target.c
openmp/libomptarget/test/unified_shared_memory/api.c
openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c
openmp/libomptarget/test/unified_shared_memory/close_modifier.c
openmp/libomptarget/test/unified_shared_memory/shared_update.c

index 5400e261772911a0a7b0bc8ab06af2f79177aad3..b138000f8cf29312901f5f39da70c90fdc2a69c6 100644 (file)
@@ -252,7 +252,7 @@ void AMDGPUOpenMPToolChain::addClangTargetOptions(
   std::string BitcodeSuffix;
   if (DriverArgs.hasFlag(options::OPT_fopenmp_target_new_runtime,
                          options::OPT_fno_openmp_target_new_runtime, false))
-    BitcodeSuffix = "new-amdgcn-" + GPUArch;
+    BitcodeSuffix = "new-amdgpu-" + GPUArch;
   else
     BitcodeSuffix = "amdgcn-" + GPUArch;
 
index a4f9862fb09b3cd2cc585c61dbf94d47f24e2731..419c64d381168a02b664461cad947d174236cba8 100644 (file)
@@ -226,6 +226,5 @@ foreach(sm ${nvptx_sm_list})
 endforeach()
 
 foreach(mcpu ${amdgpu_mcpus})
-  # require D112227 or similar to enable the compilation for amdgpu
-  # compileDeviceRTLLibrary(${mcpu} amdgpu -target amdgcn-amd-amdhsa -D__AMDGCN__ -fvisibility=default -nogpulib)
+  compileDeviceRTLLibrary(${mcpu} amdgpu -target amdgcn-amd-amdhsa -D__AMDGCN__ -fvisibility=default -nogpulib)
 endforeach()
index 2b6f20fb1732cdbdfd2e6745ff7697443ba2d7af..f7c61dc013cf1f41f2cebacdbb442574dc7c9597 100644 (file)
@@ -20,9 +20,9 @@ using namespace _OMP;
 
 #pragma omp declare target
 
-extern uint32_t __omp_rtl_debug_kind;
+extern uint32_t __omp_rtl_debug_kind; // defined by CGOpenMPRuntimeGPU
 
-// TOOD: We want to change the name as soon as the old runtime is gone.
+// TODO: We want to change the name as soon as the old runtime is gone.
 DeviceEnvironmentTy CONSTANT(omptarget_device_environment)
     __attribute__((used));
 
index c77e766ae6ca9fc16645643f888619f3c7b42cee..33e2194b25f385b637288806b521c6978380f23e 100644 (file)
@@ -68,8 +68,23 @@ uint64_t atomicAdd(uint64_t *Address, uint64_t Val, int Ordering) {
 ///{
 #pragma omp begin declare variant match(device = {arch(amdgcn)})
 
-uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering) {
-  return __builtin_amdgcn_atomic_inc32(Address, Val, Ordering, "");
+uint32_t atomicInc(uint32_t *A, uint32_t V, int Ordering) {
+  // __builtin_amdgcn_atomic_inc32 should expand to this switch when passed a
+  // runtime ordering, but does not do so yet. Work around that here.
+  switch (Ordering) {
+  default:
+    __builtin_unreachable();
+  case __ATOMIC_RELAXED:
+    return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_RELAXED, "");
+  case __ATOMIC_ACQUIRE:
+    return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_ACQUIRE, "");
+  case __ATOMIC_RELEASE:
+    return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_RELEASE, "");
+  case __ATOMIC_ACQ_REL:
+    return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_ACQ_REL, "");
+  case __ATOMIC_SEQ_CST:
+    return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_SEQ_CST, "");
+  }
 }
 
 uint32_t SHARED(namedBarrierTracker);
@@ -126,19 +141,64 @@ void namedBarrier() {
   fence::team(__ATOMIC_RELEASE);
 }
 
+// Sema checking of __builtin_amdgcn_fence is aggressive. The intention is to
+// patch clang so that it is usable within a template environment and so that a
+// runtime memory order is expanded to this switch within clang/llvm.
+void fenceTeam(int Ordering) {
+  switch (Ordering) {
+  default:
+    __builtin_unreachable();
+  case __ATOMIC_ACQUIRE:
+    return __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
+  case __ATOMIC_RELEASE:
+    return __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
+  case __ATOMIC_ACQ_REL:
+    return __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "workgroup");
+  case __ATOMIC_SEQ_CST:
+    return __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
+  }
+}
+void fenceKernel(int Ordering) {
+  switch (Ordering) {
+  default:
+    __builtin_unreachable();
+  case __ATOMIC_ACQUIRE:
+    return __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "agent");
+  case __ATOMIC_RELEASE:
+    return __builtin_amdgcn_fence(__ATOMIC_RELEASE, "agent");
+  case __ATOMIC_ACQ_REL:
+    return __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent");
+  case __ATOMIC_SEQ_CST:
+    return __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
+  }
+}
+void fenceSystem(int Ordering) {
+  switch (Ordering) {
+  default:
+    __builtin_unreachable();
+  case __ATOMIC_ACQUIRE:
+    return __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "");
+  case __ATOMIC_RELEASE:
+    return __builtin_amdgcn_fence(__ATOMIC_RELEASE, "");
+  case __ATOMIC_ACQ_REL:
+    return __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "");
+  case __ATOMIC_SEQ_CST:
+    return __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
+  }
+}
+
 void syncWarp(__kmpc_impl_lanemask_t) {
   // AMDGCN doesn't need to sync threads in a warp
 }
 
 void syncThreads() { __builtin_amdgcn_s_barrier(); }
 
-void syncThreadsAligned() { syncThreads(); }
-
-void fenceTeam(int Ordering) { __builtin_amdgcn_fence(Ordering, "workgroup"); }
-
-void fenceKernel(int Ordering) { __builtin_amdgcn_fence(Ordering, "agent"); }
-
-void fenceSystem(int Ordering) { __builtin_amdgcn_fence(Ordering, ""); }
+// TODO: Don't have wavefront lane locks. Possibly can't have them.
+void unsetLock(omp_lock_t *) { __builtin_trap(); }
+int testLock(omp_lock_t *) { __builtin_trap(); }
+void initLock(omp_lock_t *) { __builtin_trap(); }
+void destroyLock(omp_lock_t *) { __builtin_trap(); }
+void setLock(omp_lock_t *) { __builtin_trap(); }
 
 #pragma omp end declare variant
 ///}
index 0b830f631e904d7d195b0656d67a6b4bbe867e7b..fb1e0dd2c105a0c6c205497c3b920747a2f1bc51 100644 (file)
@@ -122,3 +122,4 @@ endif()
 
 # Report to the parent scope that we are building a plugin for amdgpu
 set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa" PARENT_SCOPE)
+set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa amdgcn-amd-amdhsa-newRTL" PARENT_SCOPE) # PARENT_SCOPE leaves the local value unchanged, so the base target must be repeated here
index ec238907efc12efbb137fe9a3be0dc1c47537700..dff5987775eb1b5955c65fbe329a6fd21e0017b9 100644 (file)
@@ -2,6 +2,7 @@
 
 // amdgcn does not have printf definition
 // XFAIL: amdgcn-amd-amdhsa
+// XFAIL: amdgcn-amd-amdhsa-newRTL
 
 #include <stdio.h>
 
index 7edd7db880fb1cfbc1da9cb148500f44c6de205f..7825d98c05c15fa56dc739e7f25d4ff45906f4ad 100644 (file)
@@ -2,6 +2,7 @@
 
 // amdgcn does not have printf definition
 // XFAIL: amdgcn-amd-amdhsa
+// XFAIL: amdgcn-amd-amdhsa-newRTL
 
 #include <cstdio>
 #include <cstdlib>
index c8986dd66f2c56ab34b54653d354e5ef2b5a1460..bf2adddfccfbfa9da4e32e332f1f0da269e51d3d 100644 (file)
@@ -2,6 +2,7 @@
 
 // amdgcn does not have printf definition
 // XFAIL: amdgcn-amd-amdhsa
+// XFAIL: amdgcn-amd-amdhsa-newRTL
 
 #include <cstdio>
 #include <cstdlib>
index cd67dddc664bf4b52bb747e1a830743535538f57..c6d2bda187a9838b257e2c71b8d5850944d24019 100644 (file)
@@ -2,6 +2,7 @@
 
 // fails with error message 'Unable to generate target entries' on amdgcn
 // XFAIL: amdgcn-amd-amdhsa
+// XFAIL: amdgcn-amd-amdhsa-newRTL
 
 #include <stdio.h>
 #include <omp.h>
index 6e353244315dc2955d50cacfa94f4485c4bfa0bf..9cd38339b5cf9d907f5d0a6d7e425601b25dc5e6 100644 (file)
@@ -2,6 +2,7 @@
 
 // amdgcn does not have printf definition
 // XFAIL: amdgcn-amd-amdhsa
+// XFAIL: amdgcn-amd-amdhsa-newRTL
 
 #include <stdio.h>
 #include <stdint.h>
index 2a0626b5fbae977db5b397bac20cfec24cc8a5d7..fc63e8626d013f8bdd5713d770d57c896c0d4a41 100644 (file)
@@ -3,6 +3,7 @@
 
 // amdgcn does not have printf definition
 // XFAIL: amdgcn-amd-amdhsa
+// XFAIL: amdgcn-amd-amdhsa-newRTL
 
 #include <omp.h>
 #include <stdio.h>
index ddea2fb65cba58bb91d6da06085a5821b64ac0e1..485256134041536ffa7f0d1f592de973009be401 100644 (file)
@@ -2,6 +2,7 @@
 
 // amdgcn does not have printf definition
 // XFAIL: amdgcn-amd-amdhsa
+// XFAIL: amdgcn-amd-amdhsa-newRTL
 
 #include <stdio.h>
 
index 040accd2eb4b7d987f36c6d34d26dbd672e03e7b..24b97bda7d57b46059eae10b5a26a57cbdef8f86 100644 (file)
@@ -2,6 +2,7 @@
 
 // amdgcn does not have printf definition
 // UNSUPPORTED: amdgcn-amd-amdhsa
+// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
 
 #include <stdio.h>
 
index 521adf230bed8e7fa5f06d710fd8d875c227d11d..1e456af7d1efdd824f221730534b2d0c0b289b8a 100644 (file)
@@ -2,6 +2,7 @@
 
 // Wrong results on amdgcn
 // UNSUPPORTED: amdgcn-amd-amdhsa
+// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
 
 #include <iostream>
 
index 0ba081555453c6982b8a4f9edd0f8eefa3f6fd94..4907d32ac9c0fd7af4132eb9e3d0a13ade3b7859 100644 (file)
@@ -2,7 +2,7 @@
 
 // Currently hangs on amdgpu
 // UNSUPPORTED: amdgcn-amd-amdhsa
-
+// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
 // UNSUPPORTED: x86_64-pc-linux-gnu
 
 #include <cassert>
index a520442c835c50cd72d3b8f091face7a4dac0331..ca1f0e1ec3e3ef06d081c6059362b40a67b5efa5 100644 (file)
@@ -1,6 +1,7 @@
 // RUN: %libomptarget-compilexx-and-run-generic
 
 // UNSUPPORTED: amdgcn-amd-amdhsa
+// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
 
 #include <cassert>
 #include <iostream>
index d73fe1ad938f334594476dd261dbc59edbd90bd9..ae602df8c32e30a131533a6ee3e9882719443606 100644 (file)
@@ -2,6 +2,7 @@
 
 // Fails in DAGToDAG on an address space problem
 // UNSUPPORTED: amdgcn-amd-amdhsa
+// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
 
 #include <cmath>
 #include <cstdio>
index c25a4809c244f238ee6f02271f87c940c0c7fe6d..1e7cdef03caa5f7367ab4522629bc1e85855bf45 100644 (file)
@@ -9,6 +9,7 @@
 
 // amdgcn does not have printf definition
 // XFAIL: amdgcn-amd-amdhsa
+// XFAIL: amdgcn-amd-amdhsa-newRTL
 
 #include <stdio.h>
 #include <omp.h>
index 7282491b2a182258bda8de55b4171c150685e5b2..fcb531808edf6b3be893738f4eefeea07f288844 100644 (file)
@@ -4,6 +4,7 @@
 
 // Fails on amdgcn with error: GPU Memory Error
 // XFAIL: amdgcn-amd-amdhsa
+// XFAIL: amdgcn-amd-amdhsa-newRTL
 
 #include <stdio.h>
 #include <omp.h>
index e159ed82c25cc591a0437dc4e38bcf402bab2a56..62555d2eb4d9449ddd5cffa03dbea2f120d5bd01 100644 (file)
@@ -5,6 +5,7 @@
 
 // Fails on amdgcn with error: GPU Memory Error
 // XFAIL: amdgcn-amd-amdhsa
+// XFAIL: amdgcn-amd-amdhsa-newRTL
 
 #include <omp.h>
 #include <stdio.h>
index 6667fd85ec53230b323ba7b9570e6d64c9cec6ec..98f1322ff2cb36b08e82256d4ed044c4995d6bb4 100644 (file)
@@ -5,6 +5,7 @@
 
 // amdgcn does not have printf definition
 // XFAIL: amdgcn-amd-amdhsa
+// XFAIL: amdgcn-amd-amdhsa-newRTL
 
 #include <omp.h>
 #include <stdio.h>
index ab9b3e86f0a27f07baebdb6bd44734af669ec6db..2b90cf362ea1d72b4b96b0ad70fca140f060174f 100644 (file)
@@ -4,6 +4,7 @@
 
 // amdgcn does not have printf definition
 // XFAIL: amdgcn-amd-amdhsa
+// XFAIL: amdgcn-amd-amdhsa-newRTL
 
 #include <stdio.h>
 #include <omp.h>