setOperationAction(ISD::UDIV, VT, Custom);
}
+ // NEON doesn't support 64-bit vector integer muls, but SVE does.
+ setOperationAction(ISD::MUL, MVT::v1i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v2i64, Custom);
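+ // A sketch of the intended effect (assuming SVE is available): a fixed-length
+ // i64 multiply such as "mul <2 x i64>" now selects to a predicated SVE
+ // multiply, e.g.
+ //   ptrue p0.d, vl2
+ //   mul   z0.d, p0/m, z0.d, z1.d
+ // instead of being scalarised through GPRs; see the updated tests below.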
+
// NOTE: Currently this has to happen after computeRegisterProperties rather
// than the preferred option of combining it with the addRegisterClass call.
if (Subtarget->useSVEForFixedLengthVectors()) {
setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
- setOperationAction(ISD::MUL, MVT::v1i64, Custom);
- setOperationAction(ISD::MUL, MVT::v2i64, Custom);
setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
// If SVE is available, then i64 vector multiplications can also be made legal.
bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64;
- if (VT.isScalableVector() ||
- useSVEForFixedLengthVectorVT(
- VT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors()))
+ if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
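+ // For fixed-length types, LowerToPredicatedOp emits an AArch64ISD::MUL_PRED
+ // whose governing predicate covers just the type's elements (e.g.
+ // "ptrue p0.d, vl2" for v2i64, as the updated tests below show).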
return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
// Multiplications are only custom-lowered for 128-bit vectors so that
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
+; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=VBITS_EQ_128
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
;
; ADD
;
ret void
}
-; Vector i64 multiplications are not legal for NEON so use SVE when available.
define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
; CHECK-LABEL: mul_v1i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
; CHECK: mul z0.d, [[PG]]/m, z0.d, z1.d
; CHECK: ret
+
+; VBITS_EQ_128-LABEL: mul_v1i64:
+; VBITS_EQ_128: ptrue p0.d, vl1
+; VBITS_EQ_128: mul z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_128: ret
+
%res = mul <1 x i64> %op1, %op2
ret <1 x i64> %res
}
-; Vector i64 multiplications are not legal for NEON so use SVE when available.
define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
; CHECK-LABEL: mul_v2i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
; CHECK: mul z0.d, [[PG]]/m, z0.d, z1.d
; CHECK: ret
+
+; VBITS_EQ_128-LABEL: mul_v2i64:
+; VBITS_EQ_128: ptrue p0.d, vl2
+; VBITS_EQ_128: mul z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_128: ret
+
%res = mul <2 x i64> %op1, %op2
ret <2 x i64> %res
}
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
+; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=VBITS_EQ_128
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
;
; SMULH
;
; Don't use SVE for 64-bit vectors.
+; FIXME: The codegen for the >= 256-bit case can be improved.
define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; CHECK-LABEL: smulh_v8i8:
; CHECK: // %bb.0:
}
; Don't use SVE for 64-bit vectors.
+; FIXME: The codegen for the >= 256-bit case can be improved.
define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; CHECK-LABEL: smulh_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: shrn v0.2s, v0.2d, #32
; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: smulh_v2i32:
+; VBITS_EQ_128: sshll v0.2d, v0.2s, #0
+; VBITS_EQ_128-NEXT: sshll v1.2d, v1.2s, #0
+; VBITS_EQ_128-NEXT: ptrue p0.d, vl2
+; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32
+; VBITS_EQ_128-NEXT: ret
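+; Note: the i32 high-half multiply is widened to i64 (sshll), multiplied in
+; SVE (NEON has no 64-bit element integer multiply), and narrowed with
+; "shrn #32", which keeps the high 32 bits of each product.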
+
%1 = sext <2 x i32> %op1 to <2 x i64>
%2 = sext <2 x i32> %op2 to <2 x i64>
%mul = mul <2 x i64> %1, %2
;
; Don't use SVE for 64-bit vectors.
+; FIXME: The codegen for the >= 256-bit case can be improved.
define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; CHECK-LABEL: umulh_v8i8:
; CHECK: // %bb.0:
}
; Don't use SVE for 64-bit vectors.
+; FIXME: The codegen for the >= 256-bit case can be improved.
define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; CHECK-LABEL: umulh_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: shrn v0.2s, v0.2d, #32
; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: umulh_v2i32:
+; VBITS_EQ_128: ushll v0.2d, v0.2s, #0
+; VBITS_EQ_128-NEXT: ushll v1.2d, v1.2s, #0
+; VBITS_EQ_128-NEXT: ptrue p0.d, vl2
+; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32
+; VBITS_EQ_128-NEXT: ret
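+; As for smulh_v2i32 above, but with unsigned widening (ushll) feeding the
+; same SVE multiply and shrn narrowing.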
+
%1 = zext <2 x i32> %op1 to <2 x i64>
%2 = zext <2 x i32> %op2 to <2 x i64>
%mul = mul <2 x i64> %1, %2
; VBITS_EQ_128: ptrue p0.d, vl1
; VBITS_EQ_128-NEXT: movprfx z2, z0
; VBITS_EQ_128-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
-; VBITS_EQ_128-NEXT: fmov x8, d2
-; VBITS_EQ_128-NEXT: fmov x9, d1
-; VBITS_EQ_128-NEXT: mul x8, x8, x9
-; VBITS_EQ_128-NEXT: fmov d1, x8
+; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z2.d
; VBITS_EQ_128-NEXT: sub d0, d0, d1
; VBITS_EQ_128-NEXT: ret
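+; Note on the srem/urem blocks here: the expansion computes a - (a / b) * b,
+; and with SVE available the multiply now stays in vector registers
+; ("mul z1.d, p0/m") instead of round-tripping each element through GPRs.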
; VBITS_EQ_128: ptrue p0.d, vl2
; VBITS_EQ_128-NEXT: movprfx z2, z0
; VBITS_EQ_128-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
-; VBITS_EQ_128-NEXT: fmov x9, d2
-; VBITS_EQ_128-NEXT: fmov x10, d1
-; VBITS_EQ_128-NEXT: mov x8, v2.d[1]
-; VBITS_EQ_128-NEXT: mov x11, v1.d[1]
-; VBITS_EQ_128-NEXT: mul x9, x9, x10
-; VBITS_EQ_128-NEXT: mul x8, x8, x11
-; VBITS_EQ_128-NEXT: fmov d1, x9
-; VBITS_EQ_128-NEXT: mov v1.d[1], x8
+; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z2.d
; VBITS_EQ_128-NEXT: sub v0.2d, v0.2d, v1.2d
; VBITS_EQ_128-NEXT: ret
; VBITS_EQ_128: ptrue p0.d, vl1
; VBITS_EQ_128-NEXT: movprfx z2, z0
; VBITS_EQ_128-NEXT: udiv z2.d, p0/m, z2.d, z1.d
-; VBITS_EQ_128-NEXT: fmov x8, d2
-; VBITS_EQ_128-NEXT: fmov x9, d1
-; VBITS_EQ_128-NEXT: mul x8, x8, x9
-; VBITS_EQ_128-NEXT: fmov d1, x8
+; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z2.d
; VBITS_EQ_128-NEXT: sub d0, d0, d1
; VBITS_EQ_128-NEXT: ret
; VBITS_EQ_128: ptrue p0.d, vl2
; VBITS_EQ_128-NEXT: movprfx z2, z0
; VBITS_EQ_128-NEXT: udiv z2.d, p0/m, z2.d, z1.d
-; VBITS_EQ_128-NEXT: fmov x9, d2
-; VBITS_EQ_128-NEXT: fmov x10, d1
-; VBITS_EQ_128-NEXT: mov x8, v2.d[1]
-; VBITS_EQ_128-NEXT: mov x11, v1.d[1]
-; VBITS_EQ_128-NEXT: mul x9, x9, x10
-; VBITS_EQ_128-NEXT: mul x8, x8, x11
-; VBITS_EQ_128-NEXT: fmov d1, x9
-; VBITS_EQ_128-NEXT: mov v1.d[1], x8
+; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z2.d
; VBITS_EQ_128-NEXT: sub v0.2d, v0.2d, v1.2d
; VBITS_EQ_128-NEXT: ret