From 7045966982f29f2ee85dde282f76766262fe0c6e Mon Sep 17 00:00:00 2001 From: Artem Belevich Date: Mon, 6 Feb 2023 16:32:22 -0800 Subject: [PATCH] [NVPTX] Lower extraction of upper half of i32/i64 as partial move. This produces better SASS than right-shift + truncate and is fairly common for CUDA code that operates on __half2 values represented as opaque integer. Differential Revision: https://reviews.llvm.org/D143448 --- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 19 ++++++ llvm/test/CodeGen/NVPTX/f16-instructions.ll | 3 +- llvm/test/CodeGen/NVPTX/f16x2-instructions.ll | 9 +-- llvm/test/CodeGen/NVPTX/idioms.ll | 91 +++++++++++++++++++++++++++ 4 files changed, 114 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index b6a1394..ea4b59d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -3063,8 +3063,27 @@ let hasSideEffects = false in { (ins Float64Regs:$s), "mov.b64 \t{{$d1, $d2}}, $s;", []>; + def I32toI16H : NVPTXInst<(outs Int16Regs:$high), + (ins Int32Regs:$s), + "{{ .reg .b16 tmp; mov.b32 {tmp, $high}, $s; }}", + []>; + def I64toI32H : NVPTXInst<(outs Int32Regs:$high), + (ins Int64Regs:$s), + "{{ .reg .b32 tmp; mov.b64 {tmp, $high}, $s; }}", + []>; } +// Using partial vectorized move produces better SASS code for extraction of +// upper/lower parts of an integer. +def : Pat<(i16 (trunc (srl Int32Regs:$s, (i32 16)))), + (I32toI16H Int32Regs:$s)>; +def : Pat<(i16 (trunc (sra Int32Regs:$s, (i32 16)))), + (I32toI16H Int32Regs:$s)>; +def : Pat<(i32 (trunc (srl Int64Regs:$s, (i32 32)))), + (I64toI32H Int64Regs:$s)>; +def : Pat<(i32 (trunc (sra Int64Regs:$s, (i32 32)))), + (I64toI32H Int64Regs:$s)>; + let hasSideEffects = false in { // Extract element of f16x2 register. 
PTX does not provide any way // to access elements of f16x2 vector directly, so we need to diff --git a/llvm/test/CodeGen/NVPTX/f16-instructions.ll b/llvm/test/CodeGen/NVPTX/f16-instructions.ll index 2ed795d..0b994a74 100644 --- a/llvm/test/CodeGen/NVPTX/f16-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16-instructions.ll @@ -1032,8 +1032,7 @@ define half @test_copysign(half %a, half %b) #0 { ; CHECK-DAG: mov.b32 [[B:%r[0-9]+]], [[BF]]; ; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[A]], 32767; ; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[B]], -2147483648; -; CHECK-DAG: shr.u32 [[BX1:%r[0-9]+]], [[BX0]], 16; -; CHECK-DAG: cvt.u16.u32 [[BX2:%rs[0-9]+]], [[BX1]]; +; CHECK-DAG: mov.b32 {tmp, [[BX2:%rs[0-9]+]]}, [[BX0]]; ; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX2]]; ; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]]; ; CHECK: st.param.b16 [func_retval0+0], [[R]]; diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll index 4cbe46b..6fd7261 100644 --- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll @@ -999,8 +999,7 @@ define <2 x double> @test_fpext_2xdouble(<2 x half> %a) #0 { ; CHECK-LABEL: test_bitcast_2xhalf_to_2xi16( ; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_bitcast_2xhalf_to_2xi16_param_0]; ; CHECK-DAG: cvt.u16.u32 [[R0:%rs[0-9]+]], [[A]] -; CHECK-DAG: shr.u32 [[AH:%r[0-9]+]], [[A]], 16 -; CHECK-DAG: cvt.u16.u32 [[R1:%rs[0-9]+]], [[AH]] +; CHECK-DAG: mov.b32 {tmp, [[R1:%rs[0-9]+]]}, [[A]]; ; CHECK: st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]} ; CHECK: ret; define <2 x i16> @test_bitcast_2xhalf_to_2xi16(<2 x half> %a) #0 { @@ -1291,10 +1290,8 @@ define <2 x half> @test_copysign(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-DAG: and.b16 [[AI1:%rs[0-9]+]], [[AS1]], 32767; ; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[BI0]], -2147483648; ; CHECK-DAG: and.b32 [[BX1:%r[0-9]+]], [[BI1]], -2147483648; -; CHECK-DAG: shr.u32 [[BY0:%r[0-9]+]], [[BX0]], 16; -; CHECK-DAG: shr.u32 
[[BY1:%r[0-9]+]], [[BX1]], 16; -; CHECK-DAG: cvt.u16.u32 [[BZ0:%rs[0-9]+]], [[BY0]]; -; CHECK-DAG: cvt.u16.u32 [[BZ1:%rs[0-9]+]], [[BY1]]; +; CHECK-DAG: mov.b32 {tmp, [[BZ0:%rs[0-9]+]]}, [[BX0]]; } +; CHECK-DAG: mov.b32 {tmp, [[BZ1:%rs[0-9]+]]}, [[BX1]]; } ; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AI0]], [[BZ0]]; ; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AI1]], [[BZ1]]; ; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]]; diff --git a/llvm/test/CodeGen/NVPTX/idioms.ll b/llvm/test/CodeGen/NVPTX/idioms.ll index f82dac2..c6cc752 100644 --- a/llvm/test/CodeGen/NVPTX/idioms.ll +++ b/llvm/test/CodeGen/NVPTX/idioms.ll @@ -5,6 +5,9 @@ ; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} +%struct.S16 = type { i16, i16 } +%struct.S32 = type { i32, i32 } + ; CHECK-LABEL: abs_i16( define i16 @abs_i16(i16 %a) { ; CHECK: abs.s16 @@ -31,3 +34,91 @@ define i64 @abs_i64(i64 %a) { %abs = select i1 %abs.cond, i64 %a, i64 %neg ret i64 %abs } + +; CHECK-LABEL: i32_to_2xi16( +define %struct.S16 @i32_to_2xi16(i32 noundef %in) { + %low = trunc i32 %in to i16 + %high32 = lshr i32 %in, 16 + %high = trunc i32 %high32 to i16 +; CHECK: ld.param.u32 %[[R32:r[0-9]+]], [i32_to_2xi16_param_0]; +; CHECK-DAG: cvt.u16.u32 %rs{{[0-9]+}}, %[[R32]]; +; CHECK-DAG: mov.b32 {tmp, %rs{{[0-9]+}}}, %[[R32]]; + %s1 = insertvalue %struct.S16 poison, i16 %low, 0 + %s = insertvalue %struct.S16 %s1, i16 %high, 1 + ret %struct.S16 %s +} + +; CHECK-LABEL: i32_to_2xi16_lh( +; Same as above, but with rearranged order of low/high parts. 
+define %struct.S16 @i32_to_2xi16_lh(i32 noundef %in) { + %high32 = lshr i32 %in, 16 + %high = trunc i32 %high32 to i16 + %low = trunc i32 %in to i16 +; CHECK: ld.param.u32 %[[R32:r[0-9]+]], [i32_to_2xi16_lh_param_0]; +; CHECK-DAG: cvt.u16.u32 %rs{{[0-9]+}}, %[[R32]]; +; CHECK-DAG: mov.b32 {tmp, %rs{{[0-9]+}}}, %[[R32]]; + %s1 = insertvalue %struct.S16 poison, i16 %low, 0 + %s = insertvalue %struct.S16 %s1, i16 %high, 1 + ret %struct.S16 %s +} + + +; CHECK-LABEL: i32_to_2xi16_not( +define %struct.S16 @i32_to_2xi16_not(i32 noundef %in) { + %low = trunc i32 %in to i16 + ; Shift by any value other than 16 blocks the conversion to mov. + %high32 = lshr i32 %in, 15 + %high = trunc i32 %high32 to i16 +; CHECK: cvt.u16.u32 +; CHECK: shr.u32 +; CHECK: cvt.u16.u32 + %s1 = insertvalue %struct.S16 poison, i16 %low, 0 + %s = insertvalue %struct.S16 %s1, i16 %high, 1 + ret %struct.S16 %s +} + +; CHECK-LABEL: i64_to_2xi32( +define %struct.S32 @i64_to_2xi32(i64 noundef %in) { + %low = trunc i64 %in to i32 + %high64 = lshr i64 %in, 32 + %high = trunc i64 %high64 to i32 +; CHECK: ld.param.u64 %[[R64:rd[0-9]+]], [i64_to_2xi32_param_0]; +; CHECK-DAG: cvt.u32.u64 %r{{[0-9]+}}, %[[R64]]; +; CHECK-DAG: mov.b64 {tmp, %r{{[0-9]+}}}, %[[R64]]; + %s1 = insertvalue %struct.S32 poison, i32 %low, 0 + %s = insertvalue %struct.S32 %s1, i32 %high, 1 + ret %struct.S32 %s +} + +; CHECK-LABEL: i64_to_2xi32_not( +define %struct.S32 @i64_to_2xi32_not(i64 noundef %in) { + %low = trunc i64 %in to i32 + ; Shift by any value other than 32 blocks the conversion to mov. + %high64 = lshr i64 %in, 31 + %high = trunc i64 %high64 to i32 +; CHECK: cvt.u32.u64 +; CHECK: shr.u64 +; CHECK: cvt.u32.u64 + %s1 = insertvalue %struct.S32 poison, i32 %low, 0 + %s = insertvalue %struct.S32 %s1, i32 %high, 1 + ret %struct.S32 %s +} + +; CHECK-LABEL: i32_to_2xi16_shr( +; Make sure we do not get confused when our input itself is [al]shr. 
+define %struct.S16 @i32_to_2xi16_shr(i32 noundef %i){ + call void @escape_int(i32 %i); // Force %i to be loaded completely. + %i1 = ashr i32 %i, 16 + %l = trunc i32 %i1 to i16 + %h32 = ashr i32 %i1, 16 + %h = trunc i32 %h32 to i16 +; CHECK: ld.param.u32 %[[R32:r[0-9]+]], [i32_to_2xi16_shr_param_0]; +; CHECK: shr.s32 %[[R32H:r[0-9]+]], %[[R32]], 16; +; CHECK-DAG: mov.b32 {tmp, %rs{{[0-9]+}}}, %[[R32]]; +; CHECK-DAG: mov.b32 {tmp, %rs{{[0-9]+}}}, %[[R32H]]; + %s0 = insertvalue %struct.S16 poison, i16 %l, 0 + %s1 = insertvalue %struct.S16 %s0, i16 %h, 1 + ret %struct.S16 %s1 +} +declare dso_local void @escape_int(i32 noundef) + -- 2.7.4