From 83ff0ff46337422171fb36f934bd56c2bc1be15c Mon Sep 17 00:00:00 2001
From: Michael Kruse <llvm-project@meinersbur.de>
Date: Thu, 13 May 2021 17:12:23 -0500
Subject: [PATCH] [Clang][OpenMP] Allow unified_shared_memory for
 Pascal-generation GPUs.

The Pascal architecture supports the page migration engine required for
unified_shared_memory, as indicated by NVIDIA:
 * https://developer.nvidia.com/blog/unified-memory-cuda-beginners/
 * https://developer.nvidia.com/blog/beyond-gpu-memory-limits-unified-memory-pascal/
 * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements

The limitation was introduced in D54493 which justified the cut-off by
the requirement for unified addressing. However, Unified Virtual
Addressing (UVA) is already available with sm20 (Fermi, Kepler,
Maxwell):
 * https://docs.nvidia.com/cuda/gpudirect-rdma/index.html#basics-of-uva-cuda-memory-management

Unified shared memory might even be possible with these, but with
migration of entire allocations on kernel startup.

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D101595
---
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 8 ++++----
 clang/test/OpenMP/requires_codegen.cpp   | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index fd492ea..2ceb3a4 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -4441,10 +4441,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(
       case CudaArch::SM_37:
       case CudaArch::SM_50:
       case CudaArch::SM_52:
-      case CudaArch::SM_53:
-      case CudaArch::SM_60:
-      case CudaArch::SM_61:
-      case CudaArch::SM_62: {
+      case CudaArch::SM_53: {
         SmallString<256> Buffer;
         llvm::raw_svector_ostream Out(Buffer);
         Out << "Target architecture " << CudaArchToString(Arch)
@@ -4452,6 +4449,9 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(
         CGM.Error(Clause->getBeginLoc(), Out.str());
         return;
       }
+      case CudaArch::SM_60:
+      case CudaArch::SM_61:
+      case CudaArch::SM_62:
       case CudaArch::SM_70:
       case CudaArch::SM_72:
       case CudaArch::SM_75:
diff --git a/clang/test/OpenMP/requires_codegen.cpp b/clang/test/OpenMP/requires_codegen.cpp
index 84821e8..1029a86 100644
--- a/clang/test/OpenMP/requires_codegen.cpp
+++ b/clang/test/OpenMP/requires_codegen.cpp
@@ -8,9 +8,9 @@
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -target-cpu sm_50 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t-out.ll -DREGION_DEVICE
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -target-cpu sm_52 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t-out.ll -DREGION_DEVICE
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -target-cpu sm_53 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t-out.ll -DREGION_DEVICE
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -target-cpu sm_60 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t-out.ll -DREGION_DEVICE
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -target-cpu sm_61 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t-out.ll -DREGION_DEVICE
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -target-cpu sm_62 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t-out.ll -DREGION_DEVICE
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -target-cpu sm_60 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t-out.ll -DREGION_DEVICE_NO_ERR
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -target-cpu sm_61 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t-out.ll -DREGION_DEVICE_NO_ERR
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -target-cpu sm_62 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t-out.ll -DREGION_DEVICE_NO_ERR
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -target-cpu sm_70 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t-out.ll -DREGION_DEVICE_NO_ERR
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -target-cpu sm_72 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t-out.ll -DREGION_DEVICE_NO_ERR
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -target-cpu sm_75 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t-out.ll -DREGION_DEVICE_NO_ERR
-- 
2.7.4