From d6775052de6b7aaeaef17fdac1c796c2b99a91ee Mon Sep 17 00:00:00 2001
From: Pavel Kopyl <pavelkopyl@gmail.com>
Date: Mon, 9 Jan 2023 17:55:55 +0300
Subject: [PATCH] [NVPTX] Set default version of architecture to SM_30, PTX to
 6.0.

Support of variadic functions triggers an assertion on several tests
from llvm/test/CodeGen/Generic/ if nvptx64-* is specified as a default
triplet:

Support for variadic functions (unsized array parameter) introduced in
PTX ISA version 6.0 and requires target sm_30.

That happens because those tests contain variadic function calls and
default versions of both PTX ISA (3.2) and architecture (sm_20) are
below the minimally required.

There were no observable problems with these tests before adding
support of variadic functions, because nvptx backend just didn't
handle them properly generating invalid PTX code.

Differential Revision: https://reviews.llvm.org/D141054
---
 llvm/lib/Target/NVPTX/NVPTX.td                  | 6 +++---
 llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp        | 6 +++---
 llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll    | 2 +-
 llvm/test/CodeGen/NVPTX/nvvm-reflect-ocl.ll     | 2 +-
 llvm/test/CodeGen/NVPTX/sm-version.ll           | 4 +++-
 llvm/test/CodeGen/NVPTX/surf-tex.py             | 4 ++--
 llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll | 2 +-
 7 files changed, 14 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td
index dcdd1286..4d4203c 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.td
+++ b/llvm/lib/Target/NVPTX/NVPTX.td
@@ -115,11 +115,11 @@ def PTX78 : SubtargetFeature<"ptx78", "PTXVersion", "78",
 class Proc<string Name, list<SubtargetFeature> Features>
  : Processor<Name, NoItineraries, Features>;
 
-def : Proc<"sm_20", [SM20]>;
-def : Proc<"sm_21", [SM21]>;
+def : Proc<"sm_20", [SM20, PTX32]>;
+def : Proc<"sm_21", [SM21, PTX32]>;
 def : Proc<"sm_30", [SM30]>;
 def : Proc<"sm_32", [SM32, PTX40]>;
-def : Proc<"sm_35", [SM35]>;
+def : Proc<"sm_35", [SM35, PTX32]>;
 def : Proc<"sm_37", [SM37, PTX41]>;
 def : Proc<"sm_50", [SM50, PTX40]>;
 def : Proc<"sm_52", [SM52, PTX41]>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index a03492a..2347f46 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -33,13 +33,13 @@ void NVPTXSubtarget::anchor() {}
 NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU,
                                                                 StringRef FS) {
     // Provide the default CPU if we don't have one.
-    TargetName = std::string(CPU.empty() ? "sm_20" : CPU);
+    TargetName = std::string(CPU.empty() ? "sm_30" : CPU);
 
     ParseSubtargetFeatures(TargetName, /*TuneCPU*/ TargetName, FS);
 
-    // Set default to PTX 3.2 (CUDA 5.5)
+    // Set default to PTX 6.0 (CUDA 9.0)
     if (PTXVersion == 0) {
-      PTXVersion = 32;
+      PTXVersion = 60;
   }
 
   return *this;
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll
index c4be3b0..e8c554c 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll
@@ -1,7 +1,7 @@
 ; Libdevice in recent CUDA versions relies on __CUDA_ARCH reflecting GPU type.
 ; Verify that __nvvm_reflect() is replaced with an appropriate value.
 ;
-; RUN: opt %s -S -passes='default<O2>' -mtriple=nvptx64 \
+; RUN: opt %s -S -passes='default<O2>' -mtriple=nvptx64 -mcpu=sm_20 \
 ; RUN:   | FileCheck %s --check-prefixes=COMMON,SM20
 ; RUN: opt %s -S -passes='default<O2>' -mtriple=nvptx64 -mcpu=sm_35 \
 ; RUN:   | FileCheck %s --check-prefixes=COMMON,SM35
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-ocl.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-ocl.ll
index 7a5a5a7..2c9ea47 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-ocl.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-ocl.ll
@@ -1,6 +1,6 @@
 ; Verify that __nvvm_reflect_ocl() is replaced with an appropriate value
 ;
-; RUN: opt %s -S -passes='default<O2>' -mtriple=nvptx64 \
+; RUN: opt %s -S -passes='default<O2>' -mtriple=nvptx64 -mcpu=sm_20 \
 ; RUN:   | FileCheck %s --check-prefixes=COMMON,SM20
 ; RUN: opt %s -S -passes='default<O2>' -mtriple=nvptx64 -mcpu=sm_35 \
 ; RUN:   | FileCheck %s --check-prefixes=COMMON,SM35
diff --git a/llvm/test/CodeGen/NVPTX/sm-version.ll b/llvm/test/CodeGen/NVPTX/sm-version.ll
index 7cb56a8..eacd7f3 100644
--- a/llvm/test/CodeGen/NVPTX/sm-version.ll
+++ b/llvm/test/CodeGen/NVPTX/sm-version.ll
@@ -32,7 +32,9 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 | FileCheck %s --check-prefix=SM80
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_86 | FileCheck %s --check-prefix=SM86
 
-; SM30: .version 3.2
+; SM20: .version 3.2
+; SM21: .version 3.2
+; SM30: .version 6.0
 ; SM32: .version 4.0
 ; SM35: .version 3.2
 ; SM37: .version 4.1
diff --git a/llvm/test/CodeGen/NVPTX/surf-tex.py b/llvm/test/CodeGen/NVPTX/surf-tex.py
index c0b40f1..4e239ae 100644
--- a/llvm/test/CodeGen/NVPTX/surf-tex.py
+++ b/llvm/test/CodeGen/NVPTX/surf-tex.py
@@ -1,6 +1,6 @@
 # RUN: %python %s --target=cuda --tests=suld,sust,tex,tld4 --gen-list=%t.list > %t-cuda.ll
-# RUN: llc %t-cuda.ll -verify-machineinstrs -o - | FileCheck %t-cuda.ll
-# RUN: %if ptxas %{ llc %t-cuda.ll -verify-machineinstrs -o - | %ptxas-verify %}
+# RUN: llc -mcpu=sm_20 %t-cuda.ll -verify-machineinstrs -o - | FileCheck %t-cuda.ll
+# RUN: %if ptxas %{ llc -mcpu=sm_20 %t-cuda.ll -verify-machineinstrs -o - | %ptxas-verify %}
 
 # We only need to run this second time for texture tests, because
 # there is a difference between unified and non-unified intrinsics.
diff --git a/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll b/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll
index b5c13da..65feae8 100644
--- a/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll
+++ b/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda | FileCheck %s
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-cuda | %ptxas-verify %}
 
-; CHECK: .target sm_20, debug
+; CHECK: .target sm_30, debug
 
 ; CHECK: .visible .func use_dbg_declare()
 ; CHECK: .local .align 8 .b8 __local_depot0[8];
-- 
2.7.4