From: David Green
Date: Wed, 26 Apr 2023 21:12:00 +0000 (+0100)
Subject: [AArch64][SVE] Generate smull/umull instead of sve v2i64 mul
X-Git-Tag: upstream/17.0.6~10294
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=d340ef697d905f81ede9747cb64160177bc0c53c;p=platform%2Fupstream%2Fllvm.git

[AArch64][SVE] Generate smull/umull instead of sve v2i64 mul

A neon smull/umull should be preferred over an sve v2i64 mul with two
extends. It will be both fewer instructions and a lower-cost multiply
instruction.

Differential Revision: https://reviews.llvm.org/D148248
---

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a5c19eb..1a7adf6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4534,8 +4534,8 @@ SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
 
   // If SVE is available then i64 vector multiplications can also be made legal.
-  bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64 ||
-                      Subtarget->forceStreamingCompatibleSVE();
+  bool OverrideNEON =
+      VT == MVT::v1i64 || Subtarget->forceStreamingCompatibleSVE();
 
   if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
@@ -4551,10 +4551,14 @@ SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
   unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
 
   if (!NewOpc) {
-    if (VT == MVT::v2i64)
+    if (VT == MVT::v2i64) {
+      // If SVE is available then i64 vector multiplications can also be made
+      // legal.
+      if (Subtarget->hasSVE())
+        return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
       // Fall through to expand this. It is not legal.
       return SDValue();
-    else
+    } else
       // Other vector multiplications are legal.
       return Op;
   }
 
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index da0e428..e4d733f 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s -o -| FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NEON
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE
 
 define <8 x i16> @smull_v8i8_v8i16(ptr %A, ptr %B) nounwind {
 ; CHECK-LABEL: smull_v8i8_v8i16:
@@ -119,19 +120,32 @@ define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind {
 }
 
 define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: smull_zext_v2i32_v2i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x1]
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    ldrh w11, [x0, #2]
-; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
-; CHECK-NEXT:    fmov x9, d0
-; CHECK-NEXT:    mov x10, v0.d[1]
-; CHECK-NEXT:    smull x8, w8, w9
-; CHECK-NEXT:    smull x9, w11, w10
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    mov v0.d[1], x9
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    ldr d0, [x1]
+; CHECK-NEON-NEXT:    ldrh w8, [x0]
+; CHECK-NEON-NEXT:    ldrh w11, [x0, #2]
+; CHECK-NEON-NEXT:    sshll v0.2d, v0.2s, #0
+; CHECK-NEON-NEXT:    fmov x9, d0
+; CHECK-NEON-NEXT:    mov x10, v0.d[1]
+; CHECK-NEON-NEXT:    smull x8, w8, w9
+; CHECK-NEON-NEXT:    smull x9, w11, w10
+; CHECK-NEON-NEXT:    fmov d0, x8
+; CHECK-NEON-NEXT:    mov v0.d[1], x9
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    ldrh w8, [x0]
+; CHECK-SVE-NEXT:    ptrue p0.d, vl2
+; CHECK-SVE-NEXT:    ldr d1, [x1]
+; CHECK-SVE-NEXT:    fmov d0, x8
+; CHECK-SVE-NEXT:    ldrh w8, [x0, #2]
+; CHECK-SVE-NEXT:    sshll v1.2d, v1.2s, #0
+; CHECK-SVE-NEXT:    mov v0.d[1], x8
+; CHECK-SVE-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-SVE-NEXT:    ret
   %load.A = load <2 x i16>, ptr %A
   %load.B = load <2 x i32>, ptr %B
   %zext.A = zext <2 x i16> %load.A to <2 x i64>
@@ -611,7 +625,7 @@ define <8 x i16> @smull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
 ; Do not use SMULL if the BUILD_VECTOR element values are too big.
 ; CHECK-LABEL: smull_noextvec_v8i8_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #64537
+; CHECK-NEXT:    mov w8, #64537 // =0xfc19
 ; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
 ; CHECK-NEXT:    dup v1.8h, w8
 ; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
@@ -635,7 +649,7 @@ define <4 x i32> @smull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
 define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
 ; CHECK-LABEL: smull_extvec_v2i32_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-1234
+; CHECK-NEXT:    mov w8, #-1234 // =0xfffffb2e
 ; CHECK-NEXT:    dup v1.2s, w8
 ; CHECK-NEXT:    smull v0.2d, v0.2s, v1.2s
 ; CHECK-NEXT:    ret
@@ -659,7 +673,7 @@ define <8 x i16> @umull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
 ; Do not use SMULL if the BUILD_VECTOR element values are too big.
 ; CHECK-LABEL: umull_noextvec_v8i8_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #999
+; CHECK-NEXT:    mov w8, #999 // =0x3e7
 ; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-NEXT:    dup v1.8h, w8
 ; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
@@ -672,7 +686,7 @@ define <8 x i16> @umull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
 define <4 x i32> @umull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
 ; CHECK-LABEL: umull_extvec_v4i16_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1234
+; CHECK-NEXT:    mov w8, #1234 // =0x4d2
 ; CHECK-NEXT:    dup v1.4h, w8
 ; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
 ; CHECK-NEXT:    ret
@@ -684,7 +698,7 @@ define <4 x i32> @umull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
 define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
 ; CHECK-LABEL: umull_extvec_v2i32_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1234
+; CHECK-NEXT:    mov w8, #1234 // =0x4d2
 ; CHECK-NEXT:    dup v1.2s, w8
 ; CHECK-NEXT:    umull v0.2d, v0.2s, v1.2s
 ; CHECK-NEXT:    ret
@@ -709,7 +723,7 @@ define <8 x i16> @amull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
 define <4 x i32> @amull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
 ; CHECK-LABEL: amull_extvec_v4i16_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1234
+; CHECK-NEXT:    mov w8, #1234 // =0x4d2
 ; CHECK-NEXT:    movi v1.2d, #0x00ffff0000ffff
 ; CHECK-NEXT:    dup v2.4h, w8
 ; CHECK-NEXT:    smull v0.4s, v0.4h, v2.4h
@@ -724,7 +738,7 @@ define <4 x i32> @amull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
 define <2 x i64> @amull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
 ; CHECK-LABEL: amull_extvec_v2i32_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1234
+; CHECK-NEXT:    mov w8, #1234 // =0x4d2
 ; CHECK-NEXT:    movi v1.2d, #0x000000ffffffff
 ; CHECK-NEXT:    dup v2.2s, w8
 ; CHECK-NEXT:    smull v0.2d, v0.2s, v2.2s
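
A minimal IR sketch of the pattern this patch targets (the function below is
illustrative and not part of the commit or the test file): with -mattr=+sve, a
v2i64 mul whose operands are both sign-extended from v2i32 should now select a
single NEON smull rather than being kept legal as an SVE predicated mul.

  define <2 x i64> @smull_sext_v2i32_v2i64(<2 x i32> %a, <2 x i32> %b) {
    ; Both operands are sext from i32, so this is expected to match the
    ; widening multiply: smull v0.2d, v0.2s, v1.2s
    %sext.a = sext <2 x i32> %a to <2 x i64>
    %sext.b = sext <2 x i32> %b to <2 x i64>
    %mul = mul <2 x i64> %sext.a, %sext.b
    ret <2 x i64> %mul
  }

When selectUmullSmull finds no smull/umull form, the new Subtarget->hasSVE()
path still lowers the v2i64 multiply to the predicated SVE mul (see the
CHECK-SVE block of smull_zext_v2i32_v2i64 above) instead of expanding it.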