(STLXPX GPR64:$lo, GPR64:$hi, GPR64:$addr)>;
def : Pat<(int_aarch64_stxp GPR64:$lo, GPR64:$hi, GPR64:$addr),
(STXPX GPR64:$lo, GPR64:$hi, GPR64:$addr)>;
+
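+// Across-lanes signed reductions (SADDV, SMAXV, SMINV). The across-lanes
+// instruction writes its scalar result into lane 0 of a vector register;
+// INSERT_SUBREG into an IMPLICIT_DEF v16i8 recovers the full register, and
+// SMOV then sign-extends the low lane into a GPR. The v4i32 form is already
+// full width, so it is read back with a plain ssub subregister extract.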
+multiclass SIMDAcrossLanesSignedIntrinsicBHS<string baseOpc, Intrinsic intOp> {
+ def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
+ (i32 (SMOVvi8to32
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
+ (i64 0)))>;
+ def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
+ (i32 (SMOVvi8to32
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
+ (i64 0)))>;
+
+ def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
+ (i32 (SMOVvi16to32
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
+ (i64 0)))>;
+ def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
+ (i32 (SMOVvi16to32
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
+ (i64 0)))>;
+
+ def : Pat<(i32 (intOp (v4i32 V128:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub),
+ ssub))>;
+}
+
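+// The unsigned variants need no SMOV: the across-lanes instructions zero
+// the destination register above the scalar result, so reading the ssub
+// subregister already yields the zero-extended value.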
+multiclass SIMDAcrossLanesUnsignedIntrinsicBHS<string baseOpc,
+ Intrinsic intOp> {
+ def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
+ ssub))>;
+ def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
+ ssub))>;
+
+ def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
+ ssub))>;
+ def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
+ ssub))>;
+
+ def : Pat<(i32 (intOp (v4i32 V128:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub),
+ ssub))>;
+}
+
+defm : SIMDAcrossLanesSignedIntrinsicBHS<"ADDV", int_aarch64_neon_saddv>;
+// vaddv_[su]32 is special: there is no two-lane across-lanes add, so use a
+// pairwise ADDP with both source operands the same register
+// (ADDP Vd.2S, Vn.2S, Vn.2S) and return lane 0 of the result.
+def : Pat<(i32 (int_aarch64_neon_saddv (v2i32 V64:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (ADDPv2i32 V64:$Rn, V64:$Rn), dsub),
+ ssub))>;
+
+defm : SIMDAcrossLanesUnsignedIntrinsicBHS<"ADDV", int_aarch64_neon_uaddv>;
+def : Pat<(i32 (int_aarch64_neon_uaddv (v2i32 V64:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (ADDPv2i32 V64:$Rn, V64:$Rn), dsub),
+ ssub))>;
+
+defm : SIMDAcrossLanesSignedIntrinsicBHS<"SMAXV", int_aarch64_neon_smaxv>;
+def : Pat<(i32 (int_aarch64_neon_smaxv (v2i32 V64:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (SMAXPv2i32 V64:$Rn, V64:$Rn), dsub),
+ ssub))>;
+
+defm : SIMDAcrossLanesSignedIntrinsicBHS<"SMINV", int_aarch64_neon_sminv>;
+def : Pat<(i32 (int_aarch64_neon_sminv (v2i32 V64:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (SMINPv2i32 V64:$Rn, V64:$Rn), dsub),
+ ssub))>;
+
+defm : SIMDAcrossLanesUnsignedIntrinsicBHS<"UMAXV", int_aarch64_neon_umaxv>;
+def : Pat<(i32 (int_aarch64_neon_umaxv (v2i32 V64:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (UMAXPv2i32 V64:$Rn, V64:$Rn), dsub),
+ ssub))>;
+
+defm : SIMDAcrossLanesUnsignedIntrinsicBHS<"UMINV", int_aarch64_neon_uminv>;
+def : Pat<(i32 (int_aarch64_neon_uminv (v2i32 V64:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (UMINPv2i32 V64:$Rn, V64:$Rn), dsub),
+ ssub))>;
defm LDRX : Load64RO<0b11, 0, 0b01, GPR64, "ldr", i64, load>;
// Floating-point
-defm LDRB : Load8RO<0b00, 1, 0b01, FPR8Op, "ldr", untyped, load>;
+defm LDRB : Load8RO<0b00, 1, 0b01, FPR8Op, "ldr", i8, load>;
defm LDRH : Load16RO<0b01, 1, 0b01, FPR16Op, "ldr", f16, load>;
defm LDRS : Load32RO<0b10, 1, 0b01, FPR32Op, "ldr", f32, load>;
defm LDRD : Load64RO<0b11, 1, 0b01, FPR64Op, "ldr", f64, load>;
// Floating-point
-defm STRB : Store8RO< 0b00, 1, 0b00, FPR8Op, "str", untyped, store>;
+defm STRB : Store8RO< 0b00, 1, 0b00, FPR8Op, "str", i8, store>;
defm STRH : Store16RO<0b01, 1, 0b00, FPR16Op, "str", f16, store>;
defm STRS : Store32RO<0b10, 1, 0b00, FPR32Op, "str", f32, store>;
defm STRD : Store64RO<0b11, 1, 0b00, FPR64Op, "str", f64, store>;
// (immediate pre-indexed)
def STRWpre : StorePreIdx<0b10, 0, 0b00, GPR32z, "str", pre_store, i32>;
def STRXpre : StorePreIdx<0b11, 0, 0b00, GPR64z, "str", pre_store, i64>;
-def STRBpre : StorePreIdx<0b00, 1, 0b00, FPR8Op, "str", pre_store, untyped>;
+def STRBpre : StorePreIdx<0b00, 1, 0b00, FPR8Op, "str", pre_store, i8>;
def STRHpre : StorePreIdx<0b01, 1, 0b00, FPR16Op, "str", pre_store, f16>;
def STRSpre : StorePreIdx<0b10, 1, 0b00, FPR32Op, "str", pre_store, f32>;
def STRDpre : StorePreIdx<0b11, 1, 0b00, FPR64Op, "str", pre_store, f64>;
// (immediate post-indexed)
def STRWpost : StorePostIdx<0b10, 0, 0b00, GPR32z, "str", post_store, i32>;
def STRXpost : StorePostIdx<0b11, 0, 0b00, GPR64z, "str", post_store, i64>;
-def STRBpost : StorePostIdx<0b00, 1, 0b00, FPR8Op, "str", post_store, untyped>;
+def STRBpost : StorePostIdx<0b00, 1, 0b00, FPR8Op, "str", post_store, i8>;
def STRHpost : StorePostIdx<0b01, 1, 0b00, FPR16Op, "str", post_store, f16>;
def STRSpost : StorePostIdx<0b10, 1, 0b00, FPR32Op, "str", post_store, f32>;
def STRDpost : StorePostIdx<0b11, 1, 0b00, FPR64Op, "str", post_store, f64>;
def Q31 : AArch64Reg<31, "q31", [D31], ["v31", ""]>, DwarfRegAlias<B31>;
}
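+// Giving FPR8 a concrete i8 value type (rather than untyped) lets the 8-bit
+// FPR load/store patterns above be written against a typed i8 access.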
-def FPR8 : RegisterClass<"AArch64", [untyped], 8, (sequence "B%u", 0, 31)> {
+def FPR8 : RegisterClass<"AArch64", [i8], 8, (sequence "B%u", 0, 31)> {
let Size = 8;
}
def FPR16 : RegisterClass<"AArch64", [f16, bf16], 16, (sequence "H%u", 0, 31)> {
let Predicates = [NotInStreamingSVEMode] in {
def : Pat<(sext_inreg (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index), i8),
(i32 (SMOVvi8to32 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index))>;
- def : Pat<(sext_inreg (anyext (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index)), i8),
+ def : Pat<(sext_inreg (anyext (i32 (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index))), i8),
(i64 (SMOVvi8to64 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index))>;
def : Pat<(sext_inreg (vector_extract (nxv8i16 ZPR:$vec), VectorIndexH:$index), i16),
(i32 (SMOVvi16to32 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index))>;
- def : Pat<(sext_inreg (anyext (vector_extract (nxv8i16 ZPR:$vec), VectorIndexH:$index)), i16),
+ def : Pat<(sext_inreg (anyext (i32 (vector_extract (nxv8i16 ZPR:$vec), VectorIndexH:$index))), i16),
(i64 (SMOVvi16to64 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index))>;
- def : Pat<(sext (vector_extract (nxv4i32 ZPR:$vec), VectorIndexS:$index)),
+ def : Pat<(sext (i32 (vector_extract (nxv4i32 ZPR:$vec), VectorIndexS:$index))),
(i64 (SMOVvi32to64 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index))>;
} // End NotInStreamingSVEMode
getValueMapping(RBIdx, Size), NumOperands);
}
-/// \returns true if a given intrinsic \p ID only uses and defines FPRs.
-static bool isFPIntrinsic(unsigned ID) {
+/// \returns true if a given intrinsic \p MI only uses and defines FPRs.
+static bool isFPIntrinsic(const MachineRegisterInfo &MRI,
+ const MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC);
// TODO: Add more intrinsics.
- switch (ID) {
+ switch (MI.getIntrinsicID()) {
default:
return false;
case Intrinsic::aarch64_neon_uaddlv:
+ case Intrinsic::aarch64_neon_uaddv:
+ case Intrinsic::aarch64_neon_umaxv:
+ case Intrinsic::aarch64_neon_uminv:
+ case Intrinsic::aarch64_neon_fmaxv:
+ case Intrinsic::aarch64_neon_fminv:
+ case Intrinsic::aarch64_neon_fmaxnmv:
+ case Intrinsic::aarch64_neon_fminnmv:
return true;
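+  // SADDLV's byte form produces a 16-bit result that is sign-extended into a
+  // GPR, so only forms with 16-bit or wider elements (and at least four
+  // lanes) stay entirely in FPRs.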
+ case Intrinsic::aarch64_neon_saddlv: {
+ const LLT SrcTy = MRI.getType(MI.getOperand(2).getReg());
+ return SrcTy.getElementType().getSizeInBits() >= 16 &&
+ SrcTy.getElementCount().getFixedValue() >= 4;
+ }
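+  // The signed B/H reductions are selected to SMOV sign-extensions into a
+  // GPR (see the SIMDAcrossLanesSignedIntrinsicBHS patterns above), so only
+  // 32-bit and wider element types use FPRs throughout.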
+ case Intrinsic::aarch64_neon_saddv:
+ case Intrinsic::aarch64_neon_smaxv:
+ case Intrinsic::aarch64_neon_sminv: {
+ const LLT SrcTy = MRI.getType(MI.getOperand(2).getReg());
+ return SrcTy.getElementType().getSizeInBits() >= 32 &&
+ SrcTy.getElementCount().getFixedValue() >= 2;
+ }
}
}
const TargetRegisterInfo &TRI,
unsigned Depth) const {
unsigned Op = MI.getOpcode();
- if (Op == TargetOpcode::G_INTRINSIC && isFPIntrinsic(MI.getIntrinsicID()))
+ if (Op == TargetOpcode::G_INTRINSIC && isFPIntrinsic(MRI, MI))
return true;
// Do we have an explicit floating point instruction?
case TargetOpcode::G_INTRINSIC: {
// Check if we know that the intrinsic has any constraints on its register
// banks. If it does, then update the mapping accordingly.
- unsigned ID = MI.getIntrinsicID();
unsigned Idx = 0;
- if (!isFPIntrinsic(ID))
+ if (!isFPIntrinsic(MRI, MI))
break;
for (const auto &Op : MI.explicit_operands()) {
if (Op.isReg())
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-eabi -aarch64-neon-syntax=generic | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-eabi -aarch64-neon-syntax=generic | FileCheck %s -check-prefixes=CHECK,SDAG
+; RUN: llc < %s -global-isel=1 -global-isel-abort=2 -mtriple=aarch64-eabi -aarch64-neon-syntax=generic 2>&1 | FileCheck %s --check-prefixes=CHECK,GISEL
; Function Attrs: nounwind readnone
declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
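+; With -global-isel-abort=2, llc falls back to SelectionDAG on instructions
+; GlobalISel cannot handle and prints a remark on stderr; the checks below
+; assert that these functions no longer take that fallback path.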
+; GISEL-NOT: Instruction selection used fallback path for add_B
+; GISEL-NOT: Instruction selection used fallback path for add_H
+; GISEL-NOT: Instruction selection used fallback path for add_S
+; GISEL-NOT: Instruction selection used fallback path for add_D
+; GISEL-NOT: Instruction selection used fallback path for oversized_ADDV_512
+; GISEL-NOT: Instruction selection used fallback path for addv_combine_i32
+; GISEL-NOT: Instruction selection used fallback path for addv_combine_i64
+
define i8 @add_B(ptr %arr) {
; CHECK-LABEL: add_B:
; CHECK: // %bb.0:
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
define i32 @oversized_ADDV_512(ptr %arr) {
-; CHECK-LABEL: oversized_ADDV_512:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0, #32]
-; CHECK-NEXT: ldp q3, q2, [x0]
-; CHECK-NEXT: add v0.4s, v3.4s, v0.4s
-; CHECK-NEXT: add v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: addv s0, v0.4s
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; SDAG-LABEL: oversized_ADDV_512:
+; SDAG: // %bb.0:
+; SDAG-NEXT: ldp q0, q1, [x0, #32]
+; SDAG-NEXT: ldp q3, q2, [x0]
+; SDAG-NEXT: add v0.4s, v3.4s, v0.4s
+; SDAG-NEXT: add v1.4s, v2.4s, v1.4s
+; SDAG-NEXT: add v0.4s, v0.4s, v1.4s
+; SDAG-NEXT: addv s0, v0.4s
+; SDAG-NEXT: fmov w0, s0
+; SDAG-NEXT: ret
+;
+; GISEL-LABEL: oversized_ADDV_512:
+; GISEL: // %bb.0:
+; GISEL-NEXT: ldp q0, q1, [x0]
+; GISEL-NEXT: ldp q2, q3, [x0, #32]
+; GISEL-NEXT: add v0.4s, v0.4s, v1.4s
+; GISEL-NEXT: add v1.4s, v2.4s, v3.4s
+; GISEL-NEXT: add v0.4s, v0.4s, v1.4s
+; GISEL-NEXT: addv s0, v0.4s
+; GISEL-NEXT: fmov w0, s0
+; GISEL-NEXT: ret
%bin.rdx = load <16 x i32>, ptr %arr
%r = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %bin.rdx)
ret i32 %r
}
define i32 @addv_combine_i32(<4 x i32> %a1, <4 x i32> %a2) {
-; CHECK-LABEL: addv_combine_i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: addv s0, v0.4s
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; SDAG-LABEL: addv_combine_i32:
+; SDAG: // %bb.0: // %entry
+; SDAG-NEXT: add v0.4s, v0.4s, v1.4s
+; SDAG-NEXT: addv s0, v0.4s
+; SDAG-NEXT: fmov w0, s0
+; SDAG-NEXT: ret
+;
+; GISEL-LABEL: addv_combine_i32:
+; GISEL: // %bb.0: // %entry
+; GISEL-NEXT: addv s0, v0.4s
+; GISEL-NEXT: addv s1, v1.4s
+; GISEL-NEXT: fmov w8, s0
+; GISEL-NEXT: fmov w9, s1
+; GISEL-NEXT: add w0, w8, w9
+; GISEL-NEXT: ret
entry:
%rdx.1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a1)
%rdx.2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2)
}
define i64 @addv_combine_i64(<2 x i64> %a1, <2 x i64> %a2) {
-; CHECK-LABEL: addv_combine_i64:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: addp d0, v0.2d
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: ret
+; SDAG-LABEL: addv_combine_i64:
+; SDAG: // %bb.0: // %entry
+; SDAG-NEXT: add v0.2d, v0.2d, v1.2d
+; SDAG-NEXT: addp d0, v0.2d
+; SDAG-NEXT: fmov x0, d0
+; SDAG-NEXT: ret
+;
+; GISEL-LABEL: addv_combine_i64:
+; GISEL: // %bb.0: // %entry
+; GISEL-NEXT: addp d0, v0.2d
+; GISEL-NEXT: addp d1, v1.2d
+; GISEL-NEXT: fmov x8, d0
+; GISEL-NEXT: fmov x9, d1
+; GISEL-NEXT: add x0, x8, x9
+; GISEL-NEXT: ret
entry:
%rdx.1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1)
%rdx.2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a2)
; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
+; RUN: llc -global-isel=1 -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
define float @test_fminv_v2f32(<2 x float> %in) {
; CHECK: test_fminv_v2f32:
; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
+; RUN: llc < %s -global-isel=1 -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
declare float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float>)
; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+; RUN: llc -global-isel=1 -mtriple=arm64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
declare <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8>, <8 x i8>)