From 7045966982f29f2ee85dde282f76766262fe0c6e Mon Sep 17 00:00:00 2001 From: Artem Belevich Date: Mon, 6 Feb 2023 16:32:22 -0800 Subject: [PATCH] [NVPTX] Lower extraction of upper half of i32/i64 as partial move. This produces better SASS than right-shift + truncate and is fairly common for CUDA code that operates on __half2 values represented as opaque integer. Differential Revision: https://reviews.llvm.org/D143448 --- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 19 ++++++ llvm/test/CodeGen/NVPTX/f16-instructions.ll | 3 +- llvm/test/CodeGen/NVPTX/f16x2-instructions.ll | 9 +-- llvm/test/CodeGen/NVPTX/idioms.ll | 91 +++++++++++++++++++++++++++ 4 files changed, 114 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index b6a1394..ea4b59d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -3063,8 +3063,27 @@ let hasSideEffects = false in { (ins Float64Regs:$s), "mov.b64 \t{{$d1, $d2}}, $s;", []>; + def I32toI16H : NVPTXInst<(outs Int16Regs:$high), + (ins Int32Regs:$s), + "{{ .reg .b16 tmp; mov.b32 {tmp, $high}, $s; }}", + []>; + def I64toI32H : NVPTXInst<(outs Int32Regs:$high), + (ins Int64Regs:$s), + "{{ .reg .b32 tmp; mov.b64 {tmp, $high}, $s; }}", + []>; } +// Using partial vectorized move produces better SASS code for extraction of +// upper/lower parts of an integer. +def : Pat<(i16 (trunc (srl Int32Regs:$s, (i32 16)))), + (I32toI16H Int32Regs:$s)>; +def : Pat<(i16 (trunc (sra Int32Regs:$s, (i32 16)))), + (I32toI16H Int32Regs:$s)>; +def : Pat<(i32 (trunc (srl Int64Regs:$s, (i32 32)))), + (I64toI32H Int64Regs:$s)>; +def : Pat<(i32 (trunc (sra Int64Regs:$s, (i32 32)))), + (I64toI32H Int64Regs:$s)>; + let hasSideEffects = false in { // Extract element of f16x2 register. 
PTX does not provide any way // to access elements of f16x2 vector directly, so we need to diff --git a/llvm/test/CodeGen/NVPTX/f16-instructions.ll b/llvm/test/CodeGen/NVPTX/f16-instructions.ll index 2ed795d..0b994a74 100644 --- a/llvm/test/CodeGen/NVPTX/f16-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16-instructions.ll @@ -1032,8 +1032,7 @@ define half @test_copysign(half %a, half %b) #0 { ; CHECK-DAG: mov.b32 [[B:%r[0-9]+]], [[BF]]; ; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[A]], 32767; ; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[B]], -2147483648; -; CHECK-DAG: shr.u32 [[BX1:%r[0-9]+]], [[BX0]], 16; -; CHECK-DAG: cvt.u16.u32 [[BX2:%rs[0-9]+]], [[BX1]]; +; CHECK-DAG: mov.b32 {tmp, [[BX2:%rs[0-9]+]]}, [[BX0]]; ; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX2]]; ; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]]; ; CHECK: st.param.b16 [func_retval0+0], [[R]]; diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll index 4cbe46b..6fd7261 100644 --- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll @@ -999,8 +999,7 @@ define <2 x double> @test_fpext_2xdouble(<2 x half> %a) #0 { ; CHECK-LABEL: test_bitcast_2xhalf_to_2xi16( ; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_bitcast_2xhalf_to_2xi16_param_0]; ; CHECK-DAG: cvt.u16.u32 [[R0:%rs[0-9]+]], [[A]] -; CHECK-DAG: shr.u32 [[AH:%r[0-9]+]], [[A]], 16 -; CHECK-DAG: cvt.u16.u32 [[R1:%rs[0-9]+]], [[AH]] +; CHECK-DAG: mov.b32 {tmp, [[R1:%rs[0-9]+]]}, [[A]]; ; CHECK: st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]} ; CHECK: ret; define <2 x i16> @test_bitcast_2xhalf_to_2xi16(<2 x half> %a) #0 { @@ -1291,10 +1290,8 @@ define <2 x half> @test_copysign(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-DAG: and.b16 [[AI1:%rs[0-9]+]], [[AS1]], 32767; ; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[BI0]], -2147483648; ; CHECK-DAG: and.b32 [[BX1:%r[0-9]+]], [[BI1]], -2147483648; -; CHECK-DAG: shr.u32 [[BY0:%r[0-9]+]], [[BX0]], 16; -; CHECK-DAG: shr.u32 
[[BY1:%r[0-9]+]], [[BX1]], 16; -; CHECK-DAG: cvt.u16.u32 [[BZ0:%rs[0-9]+]], [[BY0]]; -; CHECK-DAG: cvt.u16.u32 [[BZ1:%rs[0-9]+]], [[BY1]]; +; CHECK-DAG: mov.b32 {tmp, [[BZ0:%rs[0-9]+]]}, [[BX0]]; } +; CHECK-DAG: mov.b32 {tmp, [[BZ1:%rs[0-9]+]]}, [[BX1]]; } ; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AI0]], [[BZ0]]; ; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AI1]], [[BZ1]]; ; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]]; diff --git a/llvm/test/CodeGen/NVPTX/idioms.ll b/llvm/test/CodeGen/NVPTX/idioms.ll index f82dac2..c6cc752 100644 --- a/llvm/test/CodeGen/NVPTX/idioms.ll +++ b/llvm/test/CodeGen/NVPTX/idioms.ll @@ -5,6 +5,9 @@ ; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} +%struct.S16 = type { i16, i16 } +%struct.S32 = type { i32, i32 } + ; CHECK-LABEL: abs_i16( define i16 @abs_i16(i16 %a) { ; CHECK: abs.s16 @@ -31,3 +34,91 @@ define i64 @abs_i64(i64 %a) { %abs = select i1 %abs.cond, i64 %a, i64 %neg ret i64 %abs } + +; CHECK-LABEL: i32_to_2xi16( +define %struct.S16 @i32_to_2xi16(i32 noundef %in) { + %low = trunc i32 %in to i16 + %high32 = lshr i32 %in, 16 + %high = trunc i32 %high32 to i16 +; CHECK: ld.param.u32 %[[R32:r[0-9]+]], [i32_to_2xi16_param_0]; +; CHECK-DAG: cvt.u16.u32 %rs{{[0-9]+}}, %[[R32]]; +; CHECK-DAG: mov.b32 {tmp, %rs{{[0-9]+}}}, %[[R32]]; + %s1 = insertvalue %struct.S16 poison, i16 %low, 0 + %s = insertvalue %struct.S16 %s1, i16 %high, 1 + ret %struct.S16 %s +} + +; CHECK-LABEL: i32_to_2xi16_lh( +; Same as above, but with rearranged order of low/high parts. 
+define %struct.S16 @i32_to_2xi16_lh(i32 noundef %in) { + %high32 = lshr i32 %in, 16 + %high = trunc i32 %high32 to i16 + %low = trunc i32 %in to i16 +; CHECK: ld.param.u32 %[[R32:r[0-9]+]], [i32_to_2xi16_lh_param_0]; +; CHECK-DAG: cvt.u16.u32 %rs{{[0-9]+}}, %[[R32]]; +; CHECK-DAG: mov.b32 {tmp, %rs{{[0-9]+}}}, %[[R32]]; + %s1 = insertvalue %struct.S16 poison, i16 %low, 0 + %s = insertvalue %struct.S16 %s1, i16 %high, 1 + ret %struct.S16 %s +} + + +; CHECK-LABEL: i32_to_2xi16_not( +define %struct.S16 @i32_to_2xi16_not(i32 noundef %in) { + %low = trunc i32 %in to i16 + ; Shift by any value other than 16 blocks the conversion to mov. + %high32 = lshr i32 %in, 15 + %high = trunc i32 %high32 to i16 +; CHECK: cvt.u16.u32 +; CHECK: shr.u32 +; CHECK: cvt.u16.u32 + %s1 = insertvalue %struct.S16 poison, i16 %low, 0 + %s = insertvalue %struct.S16 %s1, i16 %high, 1 + ret %struct.S16 %s +} + +; CHECK-LABEL: i64_to_2xi32( +define %struct.S32 @i64_to_2xi32(i64 noundef %in) { + %low = trunc i64 %in to i32 + %high64 = lshr i64 %in, 32 + %high = trunc i64 %high64 to i32 +; CHECK: ld.param.u64 %[[R64:rd[0-9]+]], [i64_to_2xi32_param_0]; +; CHECK-DAG: cvt.u32.u64 %r{{[0-9]+}}, %[[R64]]; +; CHECK-DAG: mov.b64 {tmp, %r{{[0-9]+}}}, %[[R64]]; + %s1 = insertvalue %struct.S32 poison, i32 %low, 0 + %s = insertvalue %struct.S32 %s1, i32 %high, 1 + ret %struct.S32 %s +} + +; CHECK-LABEL: i64_to_2xi32_not( +define %struct.S32 @i64_to_2xi32_not(i64 noundef %in) { + %low = trunc i64 %in to i32 + ; Shift by any value other than 32 blocks the conversion to mov. + %high64 = lshr i64 %in, 31 + %high = trunc i64 %high64 to i32 +; CHECK: cvt.u32.u64 +; CHECK: shr.u64 +; CHECK: cvt.u32.u64 + %s1 = insertvalue %struct.S32 poison, i32 %low, 0 + %s = insertvalue %struct.S32 %s1, i32 %high, 1 + ret %struct.S32 %s +} + +; CHECK-LABEL: i32_to_2xi16_shr( +; Make sure we do not get confused when our input itself is [al]shr. 
+define %struct.S16 @i32_to_2xi16_shr(i32 noundef %i){ + call void @escape_int(i32 %i); // Force %i to be loaded completely. + %i1 = ashr i32 %i, 16 + %l = trunc i32 %i1 to i16 + %h32 = ashr i32 %i1, 16 + %h = trunc i32 %h32 to i16 +; CHECK: ld.param.u32 %[[R32:r[0-9]+]], [i32_to_2xi16_shr_param_0]; +; CHECK: shr.s32 %[[R32H:r[0-9]+]], %[[R32]], 16; +; CHECK-DAG: mov.b32 {tmp, %rs{{[0-9]+}}}, %[[R32]]; +; CHECK-DAG: mov.b32 {tmp, %rs{{[0-9]+}}}, %[[R32H]]; + %s0 = insertvalue %struct.S16 poison, i16 %l, 0 + %s1 = insertvalue %struct.S16 %s0, i16 %h, 1 + ret %struct.S16 %s1 +} +declare dso_local void @escape_int(i32 noundef) + -- 2.7.4