From d33910a8cc838587886a7302e7f8e6761bb5a89c Mon Sep 17 00:00:00 2001
From: Sander de Smalen
Date: Mon, 22 May 2023 13:37:19 +0000
Subject: [PATCH] [AArch64][SME2/SVE2p1] Add predicate-as-counter intrinsics
 for sel

These intrinsics are used to implement the ACLE sel intrinsics that select
a tuple of 2 or 4 values based on a predicate-as-counter operand, e.g.

  __attribute__((arm_streaming))
  svuint8x2_t svsel[_u8_x2](svcount_t png, svuint8x2_t zn, svuint8x2_t zm);

  __attribute__((arm_streaming))
  svuint8x4_t svsel[_u8_x4](svcount_t png, svuint8x4_t zn, svuint8x4_t zm);

As described in https://github.com/ARM-software/acle/pull/217
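
A minimal IR-level sketch of the new 2-vector intrinsic, mirroring the
lowering exercised by the tests added below (value names here are
illustrative; sve2p1-intrinsics-selx2.ll is the authoritative example):

  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> }
      @llvm.aarch64.sve.sel.x2.nxv16i8(target("aarch64.svcount") %pn,
                                       <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2,
                                       <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)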
"Unexpected opcode"); SDLoc DL(N); EVT VT = N->getValueType(0); + unsigned FirstVecIdx = HasPred ? 2 : 1; auto GetMultiVecOperand = [=](unsigned StartIdx) { SmallVector Regs(N->op_begin() + StartIdx, @@ -1721,16 +1724,20 @@ void AArch64DAGToDAGISel::SelectDestructiveMultiIntrinsic(SDNode *N, return createZMulTuple(Regs); }; - SDValue Zdn = GetMultiVecOperand(1); + SDValue Zdn = GetMultiVecOperand(FirstVecIdx); SDValue Zm; if (IsZmMulti) - Zm = GetMultiVecOperand(NumVecs + 1); + Zm = GetMultiVecOperand(NumVecs + FirstVecIdx); else - Zm = N->getOperand(NumVecs + 1); - - SDNode *Intrinsic = CurDAG->getMachineNode(Opcode, DL, MVT::Untyped, Zdn, Zm); + Zm = N->getOperand(NumVecs + FirstVecIdx); + SDNode *Intrinsic; + if (HasPred) + Intrinsic = CurDAG->getMachineNode(Opcode, DL, MVT::Untyped, + N->getOperand(1), Zdn, Zm); + else + Intrinsic = CurDAG->getMachineNode(Opcode, DL, MVT::Untyped, Zdn, Zm); SDValue SuperReg = SDValue(Intrinsic, 0); for (unsigned i = 0; i < NumVecs; ++i) ReplaceUses(SDValue(N, i), CurDAG->getTargetExtractSubreg( @@ -5330,6 +5337,20 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectUnaryMultiIntrinsic(Node, 4, /*IsTupleInput=*/true, AArch64::UZP_VG4_4Z4Z_Q); return; + case Intrinsic::aarch64_sve_sel_x2: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), + {AArch64::SEL_VG2_2ZC2Z2Z_B, AArch64::SEL_VG2_2ZC2Z2Z_H, + AArch64::SEL_VG2_2ZC2Z2Z_S, AArch64::SEL_VG2_2ZC2Z2Z_D})) + SelectDestructiveMultiIntrinsic(Node, 2, true, Op, /*HasPred=*/true); + return; + case Intrinsic::aarch64_sve_sel_x4: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), + {AArch64::SEL_VG4_4ZC4Z4Z_B, AArch64::SEL_VG4_4ZC4Z4Z_H, + AArch64::SEL_VG4_4ZC4Z4Z_S, AArch64::SEL_VG4_4ZC4Z4Z_D})) + SelectDestructiveMultiIntrinsic(Node, 4, true, Op, /*HasPred=*/true); + return; case Intrinsic::aarch64_sve_frinta_x2: SelectFrintFromVT(Node, 2, AArch64::FRINTA_2Z2Z_S); return; diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 0696d0f..1c2905f 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -632,8 +632,8 @@ defm SQRSHRN_VG4_Z4ZI : sme2_sat_shift_vector_vg4<"sqrshrn", 0b100, int_aarch64_ defm UQRSHRN_VG4_Z4ZI : sme2_sat_shift_vector_vg4<"uqrshrn", 0b101, int_aarch64_sve_uqrshrn_x4>; defm SQRSHRUN_VG4_Z4ZI : sme2_sat_shift_vector_vg4<"sqrshrun", 0b110, int_aarch64_sve_sqrshrun_x4>; -defm SEL_VG2_2ZP2Z2Z: sme2_sel_vector_vg2<"sel">; -defm SEL_VG4_4ZP4Z4Z: sme2_sel_vector_vg4<"sel">; +defm SEL_VG2_2ZC2Z2Z: sme2_sel_vector_vg2<"sel">; +defm SEL_VG4_4ZC4Z4Z: sme2_sel_vector_vg4<"sel">; def LD1B_VG2_M2ZPXX : sme2_ld_vector_vg2_multi_scalar_scalar<0b00, 0b0, ZZ_b_strided, GPR64shifted8, "ld1b">; def LD1B_VG4_M4ZPXX : sme2_ld_vector_vg4_multi_scalar_scalar<0b00, 0b0, ZZZZ_b_strided, GPR64shifted8, "ld1b">; diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx2.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx2.ll new file mode 100644 index 0000000..8bae00d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx2.ll @@ -0,0 +1,174 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s + +; == 8 to 64-bit elements == + +define { , } @sel_x2_i8(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zm1, %zm2) nounwind { +; CHECK-LABEL: sel_x2_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: sel { z0.b, z1.b }, pn8, { z6.b, z7.b }, { z4.b, z5.b } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.sel.x2.nxv16i8(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @sel_x2_i16(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zm1, %zm2) nounwind { +; CHECK-LABEL: sel_x2_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: sel { z0.h, z1.h }, pn8, { z6.h, z7.h }, { z4.h, z5.h } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.sel.x2.nxv8i16(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @sel_x2_f16(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zm1, %zm2) nounwind { +; CHECK-LABEL: sel_x2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: sel { z0.h, z1.h }, pn8, { z6.h, z7.h }, { z4.h, z5.h } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.sel.x2.nxv8f16(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @sel_x2_bf16(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zm1, %zm2) nounwind { +; CHECK-LABEL: sel_x2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: sel { z0.h, z1.h }, pn8, { z6.h, z7.h }, { z4.h, z5.h } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.sel.x2.nxv8bf16(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @sel_x2_i32(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zm1, %zm2) nounwind { +; CHECK-LABEL: sel_x2_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: sel { z0.s, z1.s }, pn8, { z6.s, z7.s }, { z4.s, z5.s } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.sel.x2.nxv4i32(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @sel_x2_f32(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zm1, %zm2) nounwind { +; CHECK-LABEL: sel_x2_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: sel { z0.s, z1.s }, pn8, { z6.s, z7.s }, { z4.s, z5.s } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.sel.x2.nxv4f32(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @sel_x2_i64(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zm1, %zm2) nounwind { +; CHECK-LABEL: sel_x2_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: sel { z0.d, z1.d }, pn8, { z6.d, z7.d }, { z4.d, z5.d } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.sel.x2.nxv2i64(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @sel_x2_f64(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zm1, %zm2) nounwind { +; CHECK-LABEL: sel_x2_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: sel { z0.d, z1.d }, pn8, { z6.d, z7.d }, { z4.d, z5.d } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.sel.x2.nxv2f64(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) + ret { , } %res +} + +; == 8 to 64-bit elements == +declare { , } @llvm.aarch64.sve.sel.x2.nxv16i8(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) +declare { , } @llvm.aarch64.sve.sel.x2.nxv8i16(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) +declare { , } @llvm.aarch64.sve.sel.x2.nxv4i32(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) +declare { , } @llvm.aarch64.sve.sel.x2.nxv2i64(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) +declare { , } @llvm.aarch64.sve.sel.x2.nxv8f16(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) +declare { , } @llvm.aarch64.sve.sel.x2.nxv8bf16(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) +declare { , } @llvm.aarch64.sve.sel.x2.nxv4f32(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) +declare { , } @llvm.aarch64.sve.sel.x2.nxv2f64(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll new file mode 100644 index 0000000..5505aea --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll @@ -0,0 +1,215 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s + +; == 8 to 64-bit elements == + +define { , , , } @sel_x4_i8(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) nounwind { +; CHECK-LABEL: sel_x4_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z26.d, z7.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: mov z24.d, z5.d +; CHECK-NEXT: mov z31.d, z4.d +; CHECK-NEXT: ld1b { z27.b }, p1/z, [x0] +; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z29.d, z2.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z28.d, z1.d +; CHECK-NEXT: sel { z0.b - z3.b }, pn8, { z28.b - z31.b }, { z24.b - z27.b } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.sel.x4.nxv16i8(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @sel_x4_i16(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) nounwind { +; CHECK-LABEL: sel_x4_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z26.d, z7.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: mov z24.d, z5.d +; CHECK-NEXT: mov z31.d, z4.d +; CHECK-NEXT: ld1h { z27.h }, p1/z, [x0] +; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z29.d, z2.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z28.d, z1.d +; CHECK-NEXT: sel { z0.h - z3.h }, pn8, { z28.h - z31.h }, { z24.h - z27.h } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.sel.x4.nxv8i16(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @sel_x4_f16(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) nounwind { +; CHECK-LABEL: sel_x4_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z26.d, z7.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: mov z24.d, z5.d +; CHECK-NEXT: mov z31.d, z4.d +; CHECK-NEXT: ld1h { z27.h }, p1/z, [x0] +; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z29.d, z2.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z28.d, z1.d +; CHECK-NEXT: sel { z0.h - z3.h }, pn8, { z28.h - z31.h }, { z24.h - z27.h } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.sel.x4.nxv8f16(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @sel_x4_bf16(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) nounwind { +; CHECK-LABEL: sel_x4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z26.d, z7.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: mov z24.d, z5.d +; CHECK-NEXT: mov z31.d, z4.d +; CHECK-NEXT: ld1h { z27.h }, p1/z, [x0] +; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z29.d, z2.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z28.d, z1.d +; CHECK-NEXT: sel { z0.h - z3.h }, pn8, { z28.h - z31.h }, { z24.h - z27.h } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.sel.x4.nxv8bf16(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @sel_x4_i32(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) nounwind { +; CHECK-LABEL: sel_x4_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z26.d, z7.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: mov z24.d, z5.d +; CHECK-NEXT: mov z31.d, z4.d +; CHECK-NEXT: ld1w { z27.s }, p1/z, [x0] +; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z29.d, z2.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z28.d, z1.d +; CHECK-NEXT: sel { z0.s - z3.s }, pn8, { z28.s - z31.s }, { z24.s - z27.s } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.sel.x4.nxv4i32(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @sel_x4_f32(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) nounwind { +; CHECK-LABEL: sel_x4_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z26.d, z7.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: mov z24.d, z5.d +; CHECK-NEXT: mov z31.d, z4.d +; CHECK-NEXT: ld1w { z27.s }, p1/z, [x0] +; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z29.d, z2.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z28.d, z1.d +; CHECK-NEXT: sel { z0.s - z3.s }, pn8, { z28.s - z31.s }, { z24.s - z27.s } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.sel.x4.nxv4f32(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @sel_x4_i64(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) nounwind { +; CHECK-LABEL: sel_x4_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z26.d, z7.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: mov z24.d, z5.d +; CHECK-NEXT: mov z31.d, z4.d +; CHECK-NEXT: ld1d { z27.d }, p1/z, [x0] +; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z29.d, z2.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z28.d, z1.d +; CHECK-NEXT: sel { z0.d - z3.d }, pn8, { z28.d - z31.d }, { z24.d - z27.d } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.sel.x4.nxv2i64(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @sel_x4_f64(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) nounwind { +; CHECK-LABEL: sel_x4_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z26.d, z7.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: mov z24.d, z5.d +; CHECK-NEXT: mov z31.d, z4.d +; CHECK-NEXT: ld1d { z27.d }, p1/z, [x0] +; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z29.d, z2.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z28.d, z1.d +; CHECK-NEXT: sel { z0.d - z3.d }, pn8, { z28.d - z31.d }, { z24.d - z27.d } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.sel.x4.nxv2f64(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + + +; == 8 to 64-bit elements == +declare { , , , } @llvm.aarch64.sve.sel.x4.nxv16i8(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) +declare { , , , } @llvm.aarch64.sve.sel.x4.nxv8i16(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) +declare { , , , } @llvm.aarch64.sve.sel.x4.nxv4i32(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) +declare { , , , } @llvm.aarch64.sve.sel.x4.nxv2i64(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) +declare { , , , } @llvm.aarch64.sve.sel.x4.nxv8f16(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) +declare { , , , } @llvm.aarch64.sve.sel.x4.nxv8bf16(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) +declare { , , , } @llvm.aarch64.sve.sel.x4.nxv4f32(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) +declare { , , , } @llvm.aarch64.sve.sel.x4.nxv2f64(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) -- 2.7.4