setOperationAction(ISD::UDIV, VT, Custom);
}
+ // NEON doesn't support 64-bit vector integer muls, but SVE does.
+ setOperationAction(ISD::MUL, MVT::v1i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v2i64, Custom);
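+ // A sketch of the intended effect (assuming SVE is available): a fixed-length
+ // i64 multiply such as "mul <2 x i64>" now selects to a predicated SVE
+ // multiply, e.g.
+ //   ptrue p0.d, vl2
+ //   mul   z0.d, p0/m, z0.d, z1.d
+ // instead of being scalarised through GPRs; see the updated tests below.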
+
// NOTE: Currently this has to happen after computeRegisterProperties rather
// than the preferred option of combining it with the addRegisterClass call.
if (Subtarget->useSVEForFixedLengthVectors()) {
setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
- setOperationAction(ISD::MUL, MVT::v1i64, Custom);
- setOperationAction(ISD::MUL, MVT::v2i64, Custom);
setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
// If SVE is available, then i64 vector multiplications can also be made legal.
bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64;
- if (VT.isScalableVector() ||
- useSVEForFixedLengthVectorVT(
- VT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors()))
+ if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
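+ // For fixed-length types, LowerToPredicatedOp emits an AArch64ISD::MUL_PRED
+ // whose governing predicate covers just the type's elements (e.g.
+ // "ptrue p0.d, vl2" for v2i64, as the updated tests below show).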
return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
// Multiplications are only custom-lowered for 128-bit vectors so that
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
+; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=VBITS_EQ_128
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
;
; ADD
;
ret void
}
-; Vector i64 multiplications are not legal for NEON so use SVE when available.
define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
; CHECK-LABEL: mul_v1i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
; CHECK: mul z0.d, [[PG]]/m, z0.d, z1.d
; CHECK: ret
+
+; VBITS_EQ_128-LABEL: mul_v1i64:
+; VBITS_EQ_128: ptrue p0.d, vl1
+; VBITS_EQ_128: mul z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_128: ret
+
%res = mul <1 x i64> %op1, %op2
ret <1 x i64> %res
}
-; Vector i64 multiplications are not legal for NEON so use SVE when available.
define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
; CHECK-LABEL: mul_v2i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
; CHECK: mul z0.d, [[PG]]/m, z0.d, z1.d
; CHECK: ret
+
+; VBITS_EQ_128-LABEL: mul_v2i64:
+; VBITS_EQ_128: ptrue p0.d, vl2
+; VBITS_EQ_128: mul z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_128: ret
+
%res = mul <2 x i64> %op1, %op2
ret <2 x i64> %res
}
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
+; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=VBITS_EQ_128
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
;
; SMULH
;
; Don't use SVE for 64-bit vectors.
+; FIXME: The codegen for the >= 256-bit case can be improved.
define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; CHECK-LABEL: smulh_v8i8:
; CHECK: // %bb.0:
}
; Don't use SVE for 64-bit vectors.
+; FIXME: The codegen for the >= 256-bit case can be improved.
define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; CHECK-LABEL: smulh_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: shrn v0.2s, v0.2d, #32
; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: smulh_v2i32:
+; VBITS_EQ_128: sshll v0.2d, v0.2s, #0
+; VBITS_EQ_128-NEXT: sshll v1.2d, v1.2s, #0
+; VBITS_EQ_128-NEXT: ptrue p0.d, vl2
+; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32
+; VBITS_EQ_128-NEXT: ret
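+; Note: the i32 high-half multiply is widened to i64 (sshll), multiplied in
+; SVE (NEON has no 64-bit element integer multiply), and narrowed with
+; "shrn #32", which keeps the high 32 bits of each product.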
+
%1 = sext <2 x i32> %op1 to <2 x i64>
%2 = sext <2 x i32> %op2 to <2 x i64>
%mul = mul <2 x i64> %1, %2
;
; Don't use SVE for 64-bit vectors.
+; FIXME: The codegen for the >= 256-bit case can be improved.
define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; CHECK-LABEL: umulh_v8i8:
; CHECK: // %bb.0:
}
; Don't use SVE for 64-bit vectors.
+; FIXME: The codegen for the >= 256-bit case can be improved.
define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; CHECK-LABEL: umulh_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: shrn v0.2s, v0.2d, #32
; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: umulh_v2i32:
+; VBITS_EQ_128: ushll v0.2d, v0.2s, #0
+; VBITS_EQ_128-NEXT: ushll v1.2d, v1.2s, #0
+; VBITS_EQ_128-NEXT: ptrue p0.d, vl2
+; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32
+; VBITS_EQ_128-NEXT: ret
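+; As for smulh_v2i32 above, but with unsigned widening (ushll) feeding the
+; same SVE multiply and shrn narrowing.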
+
%1 = zext <2 x i32> %op1 to <2 x i64>
%2 = zext <2 x i32> %op2 to <2 x i64>
%mul = mul <2 x i64> %1, %2
; VBITS_EQ_128: ptrue p0.d, vl1
; VBITS_EQ_128-NEXT: movprfx z2, z0
; VBITS_EQ_128-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
-; VBITS_EQ_128-NEXT: fmov x8, d2
-; VBITS_EQ_128-NEXT: fmov x9, d1
-; VBITS_EQ_128-NEXT: mul x8, x8, x9
-; VBITS_EQ_128-NEXT: fmov d1, x8
+; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z2.d
; VBITS_EQ_128-NEXT: sub d0, d0, d1
; VBITS_EQ_128-NEXT: ret
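+; Note on the srem/urem blocks here: the expansion computes a - (a / b) * b,
+; and with SVE available the multiply now stays in vector registers
+; ("mul z1.d, p0/m") instead of round-tripping each element through GPRs.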
; VBITS_EQ_128: ptrue p0.d, vl2
; VBITS_EQ_128-NEXT: movprfx z2, z0
; VBITS_EQ_128-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
-; VBITS_EQ_128-NEXT: fmov x9, d2
-; VBITS_EQ_128-NEXT: fmov x10, d1
-; VBITS_EQ_128-NEXT: mov x8, v2.d[1]
-; VBITS_EQ_128-NEXT: mov x11, v1.d[1]
-; VBITS_EQ_128-NEXT: mul x9, x9, x10
-; VBITS_EQ_128-NEXT: mul x8, x8, x11
-; VBITS_EQ_128-NEXT: fmov d1, x9
-; VBITS_EQ_128-NEXT: mov v1.d[1], x8
+; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z2.d
; VBITS_EQ_128-NEXT: sub v0.2d, v0.2d, v1.2d
; VBITS_EQ_128-NEXT: ret
; VBITS_EQ_128: ptrue p0.d, vl1
; VBITS_EQ_128-NEXT: movprfx z2, z0
; VBITS_EQ_128-NEXT: udiv z2.d, p0/m, z2.d, z1.d
-; VBITS_EQ_128-NEXT: fmov x8, d2
-; VBITS_EQ_128-NEXT: fmov x9, d1
-; VBITS_EQ_128-NEXT: mul x8, x8, x9
-; VBITS_EQ_128-NEXT: fmov d1, x8
+; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z2.d
; VBITS_EQ_128-NEXT: sub d0, d0, d1
; VBITS_EQ_128-NEXT: ret
; VBITS_EQ_128: ptrue p0.d, vl2
; VBITS_EQ_128-NEXT: movprfx z2, z0
; VBITS_EQ_128-NEXT: udiv z2.d, p0/m, z2.d, z1.d
-; VBITS_EQ_128-NEXT: fmov x9, d2
-; VBITS_EQ_128-NEXT: fmov x10, d1
-; VBITS_EQ_128-NEXT: mov x8, v2.d[1]
-; VBITS_EQ_128-NEXT: mov x11, v1.d[1]
-; VBITS_EQ_128-NEXT: mul x9, x9, x10
-; VBITS_EQ_128-NEXT: mul x8, x8, x11
-; VBITS_EQ_128-NEXT: fmov d1, x9
-; VBITS_EQ_128-NEXT: mov v1.d[1], x8
+; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z2.d
; VBITS_EQ_128-NEXT: sub v0.2d, v0.2d, v1.2d
; VBITS_EQ_128-NEXT: ret