From 82d330e0e04a55ee95dc93980761545a01543fde Mon Sep 17 00:00:00 2001
From: Jingu Kang
Date: Thu, 15 Jun 2023 17:19:35 +0100
Subject: [PATCH] [AArch64] Try to convert vector shift operation into vector
 add operation

The vector shift instructions tend to be more expensive than ADD/SUB on
AArch64 cores, so this patch adds TableGen patterns for the simple
transformation below:

  x << 1 ==> x + x

Differential Revision: https://reviews.llvm.org/D153049
---
 llvm/lib/Target/AArch64/AArch64InstrInfo.td        |  14 +++
 llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll     |   4 +-
 llvm/test/CodeGen/AArch64/arm64-vshift.ll          |  24 ++--
 llvm/test/CodeGen/AArch64/rax1.ll                  |   2 +-
 llvm/test/CodeGen/AArch64/shl-to-add.ll            | 132 +++++++++++++++++++++
 .../CodeGen/AArch64/urem-seteq-illegal-types.ll    |   2 +-
 .../vector_splat-const-shift-of-constmasked.ll     |  18 +--
 7 files changed, 171 insertions(+), 25 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/shl-to-add.ll

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index fcbb6fe..29d023b 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -7020,6 +7020,20 @@ defm SCVTF: SIMDVectorRShiftToFP<0, 0b11100, "scvtf",
 defm RSHRN   : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn",
                  BinOpFrag<(trunc (AArch64roundingvlshr node:$LHS, node:$RHS))>>;
 defm SHL     : SIMDVectorLShiftBHSD<0, 0b01010, "shl", AArch64vshl>;
+
+// X << 1 ==> X + X
+class SHLToADDPat<ValueType ty, RegisterClass regtype>
+  : Pat<(ty (AArch64vshl (ty regtype:$Rn), (i32 1))),
+        (!cast<Instruction>("ADD"#ty) regtype:$Rn, regtype:$Rn)>;
+
+def : SHLToADDPat<v16i8, FPR128>;
+def : SHLToADDPat<v8i16, FPR128>;
+def : SHLToADDPat<v4i32, FPR128>;
+def : SHLToADDPat<v2i64, FPR128>;
+def : SHLToADDPat<v8i8, FPR64>;
+def : SHLToADDPat<v4i16, FPR64>;
+def : SHLToADDPat<v2i32, FPR64>;
+
 defm SHRN    : SIMDVectorRShiftNarrowBHS<0, 0b10000, "shrn",
                  BinOpFrag<(trunc (AArch64vashr node:$LHS, node:$RHS))>>;
 defm SLI     : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", AArch64vsli>;
diff --git a/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll b/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll
index e180afa..475affa 100644
--- a/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll
@@ -18,7 +18,7 @@ define void @testLeftBad8x8(<8 x i8> %src1, <8 x i8> %src2, ptr %dest) nounwind
 ; CHECK-LABEL: testLeftBad8x8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi.8b v2, #165
-; CHECK-NEXT:    shl.8b v1, v1, #1
+; CHECK-NEXT:    add.8b v1, v1, v1
 ; CHECK-NEXT:    and.8b v0, v0, v2
 ; CHECK-NEXT:    orr.8b v0, v0, v1
 ; CHECK-NEXT:    str d0, [x0]
@@ -76,7 +76,7 @@ define void @testLeftBad16x8(<16 x i8> %src1, <16 x i8> %src2, ptr %dest) nounwi
 ; CHECK-LABEL: testLeftBad16x8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi.16b v2, #165
-; CHECK-NEXT:    shl.16b v1, v1, #1
+; CHECK-NEXT:    add.16b v1, v1, v1
 ; CHECK-NEXT:    and.16b v0, v0, v2
 ; CHECK-NEXT:    orr.16b v0, v0, v1
 ; CHECK-NEXT:    str q0, [x0]
diff --git a/llvm/test/CodeGen/AArch64/arm64-vshift.ll b/llvm/test/CodeGen/AArch64/arm64-vshift.ll
index 69d47a6..47058fe 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vshift.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vshift.ll
@@ -2106,7 +2106,7 @@ define <8 x i16> @neon.ushl8_noext_constant_shift(ptr %A) nounwind {
 ; CHECK-LABEL: neon.ushl8_noext_constant_shift:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    shl.8h v0, v0, #1
+; CHECK-NEXT:    add.8h v0, v0, v0
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp3 = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> %tmp1, <8 x i16> )
@@ -2146,7 +2146,7 @@ define <4 x i32> @neon.ushll4s_constant_fold() nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI160_0
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI160_0]
-; CHECK-NEXT:    shl.4s v0, v0, #1
+; CHECK-NEXT:    add.4s v0, v0, v0
 ; CHECK-NEXT:    ret
   %tmp3 = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> , <4 x i32> )
   ret <4 x i32> %tmp3
@@ -2227,7 +2227,7 @@ define <16 x i8> @neon.sshl16b_constant_shift(ptr %A) nounwind {
 ; CHECK-LABEL: neon.sshl16b_constant_shift:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    shl.16b v0, v0, #1
+; CHECK-NEXT:    add.16b v0, v0, v0
 ; CHECK-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %A
   %tmp2 = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> %tmp1, <16 x i8> )
@@ -2326,7 +2326,7 @@ define <4 x i32> @neon.sshl4s_no_fold(ptr %A) nounwind {
 ; CHECK-LABEL: neon.sshl4s_no_fold:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    shl.4s v0, v0, #1
+; CHECK-NEXT:    add.4s v0, v0, v0
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp3 = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> %tmp1, <4 x i32> )
@@ -2395,7 +2395,7 @@ define <2 x i64> @neon.sshl2d_constant_fold() nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI179_0
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI179_0]
-; CHECK-NEXT:    shl.2d v0, v0, #1
+; CHECK-NEXT:    add.2d v0, v0, v0
 ; CHECK-NEXT:    ret
   %tmp3 = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> , <2 x i64> )
   ret <2 x i64> %tmp3
@@ -3191,7 +3191,7 @@ define <8 x i8> @shl_orr8b(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    shl.8b v0, v0, #1
+; CHECK-NEXT:    add.8b v0, v0, v0
 ; CHECK-NEXT:    orr.8b v0, v0, v1
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
@@ -3206,7 +3206,7 @@ define <4 x i16> @shl_orr4h(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    shl.4h v0, v0, #1
+; CHECK-NEXT:    add.4h v0, v0, v0
 ; CHECK-NEXT:    orr.8b v0, v0, v1
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
@@ -3221,7 +3221,7 @@ define <2 x i32> @shl_orr2s(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    shl.2s v0, v0, #1
+; CHECK-NEXT:    add.2s v0, v0, v0
 ; CHECK-NEXT:    orr.8b v0, v0, v1
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
@@ -3236,7 +3236,7 @@ define <16 x i8> @shl_orr16b(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    shl.16b v0, v0, #1
+; CHECK-NEXT:    add.16b v0, v0, v0
 ; CHECK-NEXT:    orr.16b v0, v0, v1
 ; CHECK-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %A
@@ -3251,7 +3251,7 @@ define <8 x i16> @shl_orr8h(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    shl.8h v0, v0, #1
+; CHECK-NEXT:    add.8h v0, v0, v0
 ; CHECK-NEXT:    orr.16b v0, v0, v1
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
@@ -3266,7 +3266,7 @@ define <4 x i32> @shl_orr4s(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    shl.4s v0, v0, #1
+; CHECK-NEXT:    add.4s v0, v0, v0
 ; CHECK-NEXT:    orr.16b v0, v0, v1
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
@@ -3281,7 +3281,7 @@ define <2 x i64> @shl_orr2d(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    shl.2d v0, v0, #1
+; CHECK-NEXT:    add.2d v0, v0, v0
 ; CHECK-NEXT:    orr.16b v0, v0, v1
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
diff --git a/llvm/test/CodeGen/AArch64/rax1.ll b/llvm/test/CodeGen/AArch64/rax1.ll
index a90007f..414fa07 100644
--- a/llvm/test/CodeGen/AArch64/rax1.ll
+++ b/llvm/test/CodeGen/AArch64/rax1.ll
@@ -11,7 +11,7 @@ define <2 x i64> @rax1(<2 x i64> %x, <2 x i64> %y) {
 ; NOSHA3-LABEL: rax1:
 ; NOSHA3:       // %bb.0:
 ; NOSHA3-NEXT:    ushr v2.2d, v1.2d, #63
-; NOSHA3-NEXT:    shl v1.2d, v1.2d, #1
+; NOSHA3-NEXT:    add v1.2d, v1.2d, v1.2d
 ; NOSHA3-NEXT:    orr v1.16b, v1.16b, v2.16b
 ; NOSHA3-NEXT:    eor v0.16b, v0.16b, v1.16b
 ; NOSHA3-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/shl-to-add.ll b/llvm/test/CodeGen/AArch64/shl-to-add.ll
new file mode 100644
index 0000000..de24998
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/shl-to-add.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -o - | FileCheck %s
+
+define <16 x i8> @shl_v16i8(<16 x i8> %a) {
+; CHECK-LABEL: shl_v16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add v0.16b, v0.16b, v0.16b
+; CHECK-NEXT:    ret
+entry:
+  %add.i = shl <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %add.i
+}
+
+define <8 x i16> @shl_v8i16(<8 x i16> %a) {
+; CHECK-LABEL: shl_v8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add v0.8h, v0.8h, v0.8h
+; CHECK-NEXT:    ret
+entry:
+  %add.i = shl <8 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %add.i
+}
+
+define <4 x i32> @shl_v4i32(<4 x i32> %a) {
+; CHECK-LABEL: shl_v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add v0.4s, v0.4s, v0.4s
+; CHECK-NEXT:    ret
+entry:
+  %add.i = shl <4 x i32> %a, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %add.i
+}
+
+define <2 x i64> @shl_v2i64(<2 x i64> %a) {
+; CHECK-LABEL: shl_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add v0.2d, v0.2d, v0.2d
+; CHECK-NEXT:    ret
+entry:
+  %add.i = shl <2 x i64> %a, <i64 1, i64 1>
+  ret <2 x i64> %add.i
+}
+
+define <8 x i8> @shl_v8i8(<8 x i8> %a) {
+; CHECK-LABEL: shl_v8i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add v0.8b, v0.8b, v0.8b
+; CHECK-NEXT:    ret
+entry:
+  %add.i = shl <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <8 x i8> %add.i
+}
+
+define <4 x i16> @shl_v4i16(<4 x i16> %a) {
+; CHECK-LABEL: shl_v4i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add v0.4h, v0.4h, v0.4h
+; CHECK-NEXT:    ret
+entry:
+  %add.i = shl <4 x i16> %a, <i16 1, i16 1, i16 1, i16 1>
+  ret <4 x i16> %add.i
+}
+
+define <2 x i32> @shl_v2i32(<2 x i32> %a) {
+; CHECK-LABEL: shl_v2i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    ret
+entry:
+  %add.i = shl <2 x i32> %a, <i32 1, i32 1>
+  ret <2 x i32> %add.i
+}
+
+define <8 x i16> @sshll_v8i8(<8 x i8> %a) {
+; CHECK-LABEL: sshll_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sshll v0.8h, v0.8b, #1
+; CHECK-NEXT:    ret
+  %1 = sext <8 x i8> %a to <8 x i16>
+  %tmp = shl <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %tmp
+}
+
+define <4 x i32> @sshll_v4i16(<4 x i16> %a) {
+; CHECK-LABEL: sshll_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sshll v0.4s, v0.4h, #1
+; CHECK-NEXT:    ret
+  %1 = sext <4 x i16> %a to <4 x i32>
+  %tmp = shl <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %tmp
+}
+
+define <2 x i64> @sshll_v2i32(<2 x i32> %a) {
+; CHECK-LABEL: sshll_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sshll v0.2d, v0.2s, #1
+; CHECK-NEXT:    ret
+  %1 = sext <2 x i32> %a to <2 x i64>
+  %tmp = shl <2 x i64> %1, <i64 1, i64 1>
+  ret <2 x i64> %tmp
+}
+
+define <8 x i16> @ushll_v8i8(<8 x i8> %a) {
+; CHECK-LABEL: ushll_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v0.8h, v0.8b, #1
+; CHECK-NEXT:    ret
+  %1 = zext <8 x i8> %a to <8 x i16>
+  %tmp = shl <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %tmp
+}
+
+define <4 x i32> @ushll_v4i16(<4 x i16> %a) {
+; CHECK-LABEL: ushll_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #1
+; CHECK-NEXT:    ret
+  %1 = zext <4 x i16> %a to <4 x i32>
+  %tmp = shl <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %tmp
+}
+
+define <2 x i64> @ushll_v2i32(<2 x i32> %a) {
+; CHECK-LABEL: ushll_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v0.2d, v0.2s, #1
+; CHECK-NEXT:    ret
+  %1 = zext <2 x i32> %a to <2 x i64>
+  %tmp = shl <2 x i64> %1, <i64 1, i64 1>
+  ret <2 x i64> %tmp
+}
diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll
index 013feb6..029bf83 100644
--- a/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll
@@ -78,7 +78,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
 ; CHECK-NEXT:    mul v0.4h, v0.4h, v2.4h
 ; CHECK-NEXT:    ldr d2, [x8, :lo12:.LCPI4_2]
 ; CHECK-NEXT:    adrp x8, .LCPI4_3
-; CHECK-NEXT:    shl v3.4h, v0.4h, #1
+; CHECK-NEXT:    add v3.4h, v0.4h, v0.4h
 ; CHECK-NEXT:    bic v0.4h, #248, lsl #8
 ; CHECK-NEXT:    ushl v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    ushl v1.4h, v3.4h, v2.4h
diff --git a/llvm/test/CodeGen/AArch64/vector_splat-const-shift-of-constmasked.ll b/llvm/test/CodeGen/AArch64/vector_splat-const-shift-of-constmasked.ll
index d286c32..7e958b2 100644
--- a/llvm/test/CodeGen/AArch64/vector_splat-const-shift-of-constmasked.ll
+++ b/llvm/test/CodeGen/AArch64/vector_splat-const-shift-of-constmasked.ll
@@ -214,7 +214,7 @@ define <16 x i8> @test_128_i8_x_16_7_mask_shl_1(<16 x i8> %a0) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.16b, #7
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    shl v0.16b, v0.16b, #1
+; CHECK-NEXT:    add v0.16b, v0.16b, v0.16b
 ; CHECK-NEXT:    ret
   %t0 = and <16 x i8> %a0,
   %t1 = shl <16 x i8> %t0,
@@ -255,7 +255,7 @@ define <16 x i8> @test_128_i8_x_16_28_mask_shl_1(<16 x i8> %a0) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.16b, #28
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    shl v0.16b, v0.16b, #1
+; CHECK-NEXT:    add v0.16b, v0.16b, v0.16b
 ; CHECK-NEXT:    ret
   %t0 = and <16 x i8> %a0,
   %t1 = shl <16 x i8> %t0,
@@ -300,7 +300,7 @@ define <16 x i8> @test_128_i8_x_16_224_mask_shl_1(<16 x i8> %a0) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.16b, #224
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    shl v0.16b, v0.16b, #1
+; CHECK-NEXT:    add v0.16b, v0.16b, v0.16b
 ; CHECK-NEXT:    ret
   %t0 = and <16 x i8> %a0,
   %t1 = shl <16 x i8> %t0,
@@ -526,7 +526,7 @@ define <8 x i16> @test_128_i16_x_8_127_mask_shl_1(<8 x i16> %a0) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.8h, #127
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    shl v0.8h, v0.8h, #1
+; CHECK-NEXT:    add v0.8h, v0.8h, v0.8h
 ; CHECK-NEXT:    ret
   %t0 = and <8 x i16> %a0,
   %t1 = shl <8 x i16> %t0,
@@ -616,7 +616,7 @@ define <8 x i16> @test_128_i16_x_8_65024_mask_shl_1(<8 x i16> %a0) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.8h, #254, lsl #8
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    shl v0.8h, v0.8h, #1
+; CHECK-NEXT:    add v0.8h, v0.8h, v0.8h
 ; CHECK-NEXT:    ret
   %t0 = and <8 x i16> %a0,
   %t1 = shl <8 x i16> %t0,
@@ -842,7 +842,7 @@ define <4 x i32> @test_128_i32_x_4_32767_mask_shl_1(<4 x i32> %a0) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.4s, #127, msl #8
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    shl v0.4s, v0.4s, #1
+; CHECK-NEXT:    add v0.4s, v0.4s, v0.4s
 ; CHECK-NEXT:    ret
   %t0 = and <4 x i32> %a0,
   %t1 = shl <4 x i32> %t0,
@@ -932,7 +932,7 @@ define <4 x i32> @test_128_i32_x_4_4294836224_mask_shl_1(<4 x i32> %a0) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mvni v1.4s, #1, msl #16
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    shl v0.4s, v0.4s, #1
+; CHECK-NEXT:    add v0.4s, v0.4s, v0.4s
 ; CHECK-NEXT:    ret
   %t0 = and <4 x i32> %a0,
   %t1 = shl <4 x i32> %t0,
@@ -1165,7 +1165,7 @@ define <2 x i64> @test_128_i64_x_2_2147483647_mask_shl_1(<2 x i64> %a0) {
 ; CHECK-NEXT:    mov w8, #2147483647
 ; CHECK-NEXT:    dup v1.2d, x8
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    shl v0.2d, v0.2d, #1
+; CHECK-NEXT:    add v0.2d, v0.2d, v0.2d
 ; CHECK-NEXT:    ret
   %t0 = and <2 x i64> %a0,
   %t1 = shl <2 x i64> %t0,
@@ -1257,7 +1257,7 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_shl_1(<2 x i64> %a0
 ; CHECK-NEXT:    mov x8, #-8589934592
 ; CHECK-NEXT:    dup v1.2d, x8
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    shl v0.2d, v0.2d, #1
+; CHECK-NEXT:    add v0.2d, v0.2d, v0.2d
 ; CHECK-NEXT:    ret
   %t0 = and <2 x i64> %a0,
   %t1 = shl <2 x i64> %t0,
-- 
2.7.4
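
Postscript: for readers who want to see the new patterns fire outside the bundled
tests, a minimal standalone reproducer is sketched below. It is illustrative only;
the function name and the exact llc invocation are not part of the patch.

; Hypothetical reproducer, not included in the patch.  Run with:
;   llc -mtriple=aarch64-none-linux-gnu example.ll -o -
; Before this change the splat-by-one shift selected "shl v0.4s, v0.4s, #1";
; with the SHLToADDPat patterns it selects "add v0.4s, v0.4s, v0.4s" instead.
define <4 x i32> @double_v4i32(<4 x i32> %v) {
entry:
  %r = shl <4 x i32> %v, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %r
}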