From: Bradley Smith
Date: Fri, 30 Apr 2021 15:17:37 +0000 (+0100)
Subject: [AArch64][SVE] Better utilisation of unpredicated forms of arithmetic intrinsics
X-Git-Tag: llvmorg-14-init~7185
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=f8f953c2a6b3ee6bd50f1bc1bc81880c0d40eb6c;p=platform%2Fupstream%2Fllvm.git

[AArch64][SVE] Better utilisation of unpredicated forms of arithmetic intrinsics

When using predicated arithmetic intrinsics, if the predicate used has all
lanes active, use an unpredicated form of the instruction; additionally,
this allows for better use of immediate forms.

This also includes a new complex isel pattern which allows matching an
all-active predicate when the types are different but the predicate is a
superset of the type being used. For example, this allows a b8 ptrue to be
used for a b32 predicate operand.

This only includes instructions where the unpredicated/predicated forms
are mismatched between variants, meaning that the removal of the predicate
is done during instruction selection in order to prevent spurious
re-introductions of ptrue instructions.

Co-authored-by: Paul Walker

Differential Revision: https://reviews.llvm.org/D101062
---

diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 3d8aa29..d7c1e32 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -335,6 +335,8 @@ private:
   bool SelectSVEArithImm(SDValue N, MVT VT, SDValue &Imm);
   bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base,
                                SDValue &Offset);
+
+  bool SelectAllActivePredicate(SDValue N);
 };
 } // end anonymous namespace
@@ -4983,3 +4985,10 @@ bool AArch64DAGToDAGISel::SelectSVERegRegAddrMode(SDValue N, unsigned Scale,
   return false;
 }
+
+bool AArch64DAGToDAGISel::SelectAllActivePredicate(SDValue N) {
+  const AArch64TargetLowering *TLI =
+      static_cast<const AArch64TargetLowering *>(getTargetLowering());
+
+  return TLI->isAllActivePredicate(N);
+}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 6b1105d..3a71405 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13754,6 +13754,28 @@ static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
                      Zero);
 }
+static bool isAllActivePredicate(SDValue N) {
+  unsigned NumElts = N.getValueType().getVectorMinNumElements();
+
+  // Look through cast.
+  while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
+    N = N.getOperand(0);
+    // When reinterpreting from a type with fewer elements the "new" elements
+    // are not active, so bail if they're likely to be used.
+    if (N.getValueType().getVectorMinNumElements() < NumElts)
+      return false;
+  }
+
+  // "ptrue p.<T>, all" can be considered all active when <T> is the same size
+  // or smaller than the implicit element type represented by N.
+  // NOTE: A larger element count implies a smaller element type.
+  if (N.getOpcode() == AArch64ISD::PTRUE &&
+      N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
+    return N.getValueType().getVectorMinNumElements() >= NumElts;
+
+  return false;
+}
+
 // If a merged operation has no inactive lanes we can relax it to a predicated
 // or unpredicated operation, which potentially allows better isel (perhaps
 // using immediate forms) or relaxing register reuse requirements.
@@ -13764,8 +13786,7 @@ static SDValue convertMergedOpToPredOp(SDNode *N, unsigned PredOpc,
   SDValue Pg = N->getOperand(1);
   // ISD way to specify an all active predicate.
- if ((Pg.getOpcode() == AArch64ISD::PTRUE) && - (Pg.getConstantOperandVal(0) == AArch64SVEPredPattern::all)) + if (isAllActivePredicate(Pg)) return DAG.getNode(PredOpc, SDLoc(N), N->getValueType(0), Pg, N->getOperand(2), N->getOperand(3)); @@ -13858,6 +13879,12 @@ static SDValue performIntrinsicCombine(SDNode *N, N->getOperand(1)); case Intrinsic::aarch64_sve_ext: return LowerSVEIntrinsicEXT(N, DAG); + case Intrinsic::aarch64_sve_mul: + return convertMergedOpToPredOp(N, AArch64ISD::MUL_PRED, DAG); + case Intrinsic::aarch64_sve_smulh: + return convertMergedOpToPredOp(N, AArch64ISD::MULHS_PRED, DAG); + case Intrinsic::aarch64_sve_umulh: + return convertMergedOpToPredOp(N, AArch64ISD::MULHU_PRED, DAG); case Intrinsic::aarch64_sve_smin: return convertMergedOpToPredOp(N, AArch64ISD::SMIN_PRED, DAG); case Intrinsic::aarch64_sve_umin: @@ -13872,6 +13899,12 @@ static SDValue performIntrinsicCombine(SDNode *N, return convertMergedOpToPredOp(N, AArch64ISD::SRL_PRED, DAG); case Intrinsic::aarch64_sve_asr: return convertMergedOpToPredOp(N, AArch64ISD::SRA_PRED, DAG); + case Intrinsic::aarch64_sve_fadd: + return convertMergedOpToPredOp(N, AArch64ISD::FADD_PRED, DAG); + case Intrinsic::aarch64_sve_fsub: + return convertMergedOpToPredOp(N, AArch64ISD::FSUB_PRED, DAG); + case Intrinsic::aarch64_sve_fmul: + return convertMergedOpToPredOp(N, AArch64ISD::FMUL_PRED, DAG); case Intrinsic::aarch64_sve_cmphs: if (!N->getOperand(2).getValueType().isFloatingPoint()) return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), @@ -17613,3 +17646,7 @@ SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op, return Op; } + +bool AArch64TargetLowering::isAllActivePredicate(SDValue N) const { + return ::isAllActivePredicate(N); +} diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 8af8417..90f77a7 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -820,6 +820,8 @@ public: return 128; } + bool isAllActivePredicate(SDValue N) const; + private: /// Keep a pointer to the AArch64Subtarget around so that we can /// make the right decision when generating code for different targets. diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 634860f..6e92c92 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1378,9 +1378,9 @@ let Predicates = [HasSVE] in { defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr", AArch64lsr_p>; defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl", AArch64lsl_p>; - defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr">; - defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr">; - defm LSL_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b11, "lsl">; + defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr", int_aarch64_sve_asr_wide>; + defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr", int_aarch64_sve_lsr_wide>; + defm LSL_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b11, "lsl", int_aarch64_sve_lsl_wide>; // Predicated shifts defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right_dup<0b0000, "asr", "ASR_ZPZI", int_aarch64_sve_asr>; @@ -2410,24 +2410,6 @@ let Predicates = [HasSVE2] in { defm UMULH_ZZZ : sve2_int_mul<0b011, "umulh", null_frag, AArch64umulh_p>; defm PMUL_ZZZ : sve2_int_mul_single<0b001, "pmul", int_aarch64_sve_pmul>; - // Add patterns for unpredicated version of smulh and umulh. 
- def : Pat<(nxv16i8 (int_aarch64_sve_smulh (nxv16i1 (AArch64ptrue 31)), nxv16i8:$Op1, nxv16i8:$Op2)), - (SMULH_ZZZ_B $Op1, $Op2)>; - def : Pat<(nxv8i16 (int_aarch64_sve_smulh (nxv8i1 (AArch64ptrue 31)), nxv8i16:$Op1, nxv8i16:$Op2)), - (SMULH_ZZZ_H $Op1, $Op2)>; - def : Pat<(nxv4i32 (int_aarch64_sve_smulh (nxv4i1 (AArch64ptrue 31)), nxv4i32:$Op1, nxv4i32:$Op2)), - (SMULH_ZZZ_S $Op1, $Op2)>; - def : Pat<(nxv2i64 (int_aarch64_sve_smulh (nxv2i1 (AArch64ptrue 31)), nxv2i64:$Op1, nxv2i64:$Op2)), - (SMULH_ZZZ_D $Op1, $Op2)>; - def : Pat<(nxv16i8 (int_aarch64_sve_umulh (nxv16i1 (AArch64ptrue 31)), nxv16i8:$Op1, nxv16i8:$Op2)), - (UMULH_ZZZ_B $Op1, $Op2)>; - def : Pat<(nxv8i16 (int_aarch64_sve_umulh (nxv8i1 (AArch64ptrue 31)), nxv8i16:$Op1, nxv8i16:$Op2)), - (UMULH_ZZZ_H $Op1, $Op2)>; - def : Pat<(nxv4i32 (int_aarch64_sve_umulh (nxv4i1 (AArch64ptrue 31)), nxv4i32:$Op1, nxv4i32:$Op2)), - (UMULH_ZZZ_S $Op1, $Op2)>; - def : Pat<(nxv2i64 (int_aarch64_sve_umulh (nxv2i1 (AArch64ptrue 31)), nxv2i64:$Op1, nxv2i64:$Op2)), - (UMULH_ZZZ_D $Op1, $Op2)>; - // SVE2 complex integer dot product (indexed) defm CDOT_ZZZI : sve2_cintx_dot_by_indexed_elem<"cdot", int_aarch64_sve_cdot_lane>; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 7e05a5ce..3f6cdad 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -221,6 +221,8 @@ def SVEShiftImmR16 : ComplexPattern", [] def SVEShiftImmR32 : ComplexPattern", []>; def SVEShiftImmR64 : ComplexPattern", []>; +def SVEAllActive : ComplexPattern; + class SVEExactFPImm : AsmOperandClass { let Name = "SVEExactFPImmOperand" # Suffix; let DiagnosticType = "Invalid" # Name; @@ -339,9 +341,9 @@ class SVE_1_Op_Imm_OptLsl_Pat; -class SVE_1_Op_Imm_Arith_Pred_Pat - : Pat<(vt (op (pt (AArch64ptrue 31)), (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))), + : Pat<(vt (op (pt (SVEAllActive)), (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))), (inst $Op1, i32:$imm)>; class SVE_1_Op_Imm_Log_Pat -: Pat<(vtd (op (pt (AArch64ptrue 31)), vt1:$Op1, vt2:$Op2)), +: Pat<(vtd (op (pt (SVEAllActive)), vt1:$Op1, vt2:$Op2)), (inst $Op1, $Op2)>; class SVE_3_Op_Pat -: Pat<(vt (op (pt (AArch64ptrue 31)), vt:$Rn, (vt (AArch64dup (it (cast i32:$imm)))))), +: Pat<(vt (op (pt (SVEAllActive)), vt:$Rn, (vt (AArch64dup (it (cast i32:$imm)))))), (inst $Rn, i32:$imm)>; // @@ -4052,10 +4054,10 @@ multiclass sve_int_arith_imm1 opc, string asm, SDPatternOperator op> { def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, simm8>; def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, simm8>; - def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _B)>; - def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _H)>; - def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _S)>; - def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _D)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _B)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _H)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _S)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _D)>; } multiclass sve_int_arith_imm1_unsigned opc, string asm, SDPatternOperator op> { @@ -4064,10 +4066,10 @@ multiclass sve_int_arith_imm1_unsigned opc, string asm, SDPatternOperato def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, imm0_255>; def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, imm0_255>; - def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _B)>; - def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _H)>; - def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _S)>; - def : 
SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _D)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _B)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _H)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _S)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _D)>; } multiclass sve_int_arith_imm2 { @@ -4076,10 +4078,10 @@ multiclass sve_int_arith_imm2 { def _S : sve_int_arith_imm<0b10, 0b110000, asm, ZPR32, simm8>; def _D : sve_int_arith_imm<0b11, 0b110000, asm, ZPR64, simm8>; - def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _B)>; - def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _H)>; - def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _S)>; - def : SVE_1_Op_Imm_Arith_Pred_Pat(NAME # _D)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _B)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _H)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _S)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -5169,10 +5171,14 @@ class sve_int_bin_cons_shift_wide sz8_64, bits<2> opc, string asm, let Inst{4-0} = Zd; } -multiclass sve_int_bin_cons_shift_wide opc, string asm> { +multiclass sve_int_bin_cons_shift_wide opc, string asm, SDPatternOperator op> { def _B : sve_int_bin_cons_shift_wide<0b00, opc, asm, ZPR8>; def _H : sve_int_bin_cons_shift_wide<0b01, opc, asm, ZPR16>; def _S : sve_int_bin_cons_shift_wide<0b10, opc, asm, ZPR32>; + + def : SVE_2_Op_Pred_All_Active(NAME # _B)>; + def : SVE_2_Op_Pred_All_Active(NAME # _H)>; + def : SVE_2_Op_Pred_All_Active(NAME # _S)>; } class sve_int_bin_cons_shift_imm tsz8_64, bits<2> opc, string asm, diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll index 33c347d..c978a72 100644 --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll @@ -110,6 +110,52 @@ define @smax_i64_out_of_range( %a) { ret %out } +; As smax_i32 but where pg is i8 based and thus compatible for i32. +define @smax_i32_ptrue_all_b( %a) #0 { +; CHECK-LABEL: smax_i32_ptrue_all_b: +; CHECK: smax z0.s, z0.s, #1 +; CHECK-NEXT: ret + %pg.b = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.smax.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As smax_i32 but where pg is i16 based and thus compatible for i32. +define @smax_i32_ptrue_all_h( %a) #0 { +; CHECK-LABEL: smax_i32_ptrue_all_h: +; CHECK: smax z0.s, z0.s, #1 +; CHECK-NEXT: ret + %pg.h = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %pg.h) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.smax.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As smax_i32 but where pg is i64 based, which is not compatibile for i32 and +; thus inactive lanes are important and the immediate form cannot be used. 
+define @smax_i32_ptrue_all_d( %a) #0 { +; CHECK-LABEL: smax_i32_ptrue_all_d: +; CHECK-DAG: ptrue [[PG:p[0-9]+]].d +; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1 +; CHECK-DAG: smax z0.s, [[PG]]/m, z0.s, [[DUP]].s +; CHECK-NEXT: ret + %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %pg.d) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.smax.nxv4i32( %pg.s, + %a, + %b) + ret %out +} ; SMIN @@ -220,6 +266,53 @@ define @smin_i64_out_of_range( %a) { ret %out } +; As smin_i32 but where pg is i8 based and thus compatible for i32. +define @smin_i32_ptrue_all_b( %a) #0 { +; CHECK-LABEL: smin_i32_ptrue_all_b: +; CHECK: smin z0.s, z0.s, #1 +; CHECK-NEXT: ret + %pg.b = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.smin.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As smin_i32 but where pg is i16 based and thus compatible for i32. +define @smin_i32_ptrue_all_h( %a) #0 { +; CHECK-LABEL: smin_i32_ptrue_all_h: +; CHECK: smin z0.s, z0.s, #1 +; CHECK-NEXT: ret + %pg.h = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %pg.h) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.smin.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As smin_i32 but where pg is i64 based, which is not compatibile for i32 and +; thus inactive lanes are important and the immediate form cannot be used. +define @smin_i32_ptrue_all_d( %a) #0 { +; CHECK-LABEL: smin_i32_ptrue_all_d: +; CHECK-DAG: ptrue [[PG:p[0-9]+]].d +; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1 +; CHECK-DAG: smin z0.s, [[PG]]/m, z0.s, [[DUP]].s +; CHECK-NEXT: ret + %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %pg.d) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.smin.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + ; UMAX define @umax_i8( %a) { @@ -329,6 +422,53 @@ define @umax_i64_out_of_range( %a) { ret %out } +; As umax_i32 but where pg is i8 based and thus compatible for i32. +define @umax_i32_ptrue_all_b( %a) #0 { +; CHECK-LABEL: umax_i32_ptrue_all_b: +; CHECK: umax z0.s, z0.s, #1 +; CHECK-NEXT: ret + %pg.b = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.umax.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As umax_i32 but where pg is i16 based and thus compatible for i32. 
+define @umax_i32_ptrue_all_h( %a) #0 { +; CHECK-LABEL: umax_i32_ptrue_all_h: +; CHECK: umax z0.s, z0.s, #1 +; CHECK-NEXT: ret + %pg.h = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %pg.h) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.umax.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As umax_i32 but where pg is i64 based, which is not compatibile for i32 and +; thus inactive lanes are important and the immediate form cannot be used. +define @umax_i32_ptrue_all_d( %a) #0 { +; CHECK-LABEL: umax_i32_ptrue_all_d: +; CHECK-DAG: ptrue [[PG:p[0-9]+]].d +; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1 +; CHECK-DAG: umax z0.s, [[PG]]/m, z0.s, [[DUP]].s +; CHECK-NEXT: ret + %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %pg.d) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.umax.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + ; UMIN define @umin_i8( %a) { @@ -438,6 +578,53 @@ define @umin_i64_out_of_range( %a) { ret %out } +; As umin_i32 but where pg is i8 based and thus compatible for i32. +define @umin_i32_ptrue_all_b( %a) #0 { +; CHECK-LABEL: umin_i32_ptrue_all_b: +; CHECK: umin z0.s, z0.s, #1 +; CHECK-NEXT: ret + %pg.b = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.umin.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As umin_i32 but where pg is i16 based and thus compatible for i32. +define @umin_i32_ptrue_all_h( %a) #0 { +; CHECK-LABEL: umin_i32_ptrue_all_h: +; CHECK: umin z0.s, z0.s, #1 +; CHECK-NEXT: ret + %pg.h = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %pg.h) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.umin.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As umin_i32 but where pg is i64 based, which is not compatibile for i32 and +; thus inactive lanes are important and the immediate form cannot be used. 
+define @umin_i32_ptrue_all_d( %a) #0 { +; CHECK-LABEL: umin_i32_ptrue_all_d: +; CHECK-DAG: ptrue [[PG:p[0-9]+]].d +; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1 +; CHECK-DAG: umin z0.s, [[PG]]/m, z0.s, [[DUP]].s +; CHECK-NEXT: ret + %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %pg.d) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.umin.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + ; SQADD define @sqadd_b_lowimm( %a) { @@ -660,6 +847,42 @@ define @uqadd_s_lowimm( %a) { ret %out } +define @uqadd_s_highimm( %a) { +; CHECK-LABEL: uqadd_s_highimm: +; CHECK: // %bb.0: +; CHECK-NEXT: uqadd z0.s, z0.s, #8192 // =0x2000 +; CHECK-NEXT: ret + %elt = insertelement undef, i32 8192, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.uqadd.x.nxv4i32( %a, + %splat) + ret %out +} + +define @uqadd_d_lowimm( %a) { +; CHECK-LABEL: uqadd_d_lowimm: +; CHECK: // %bb.0: +; CHECK-NEXT: uqadd z0.d, z0.d, #255 // =0xff +; CHECK-NEXT: ret + %elt = insertelement undef, i64 255, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.uqadd.x.nxv2i64( %a, + %splat) + ret %out +} + +define @uqadd_d_highimm( %a) { +; CHECK-LABEL: uqadd_d_highimm: +; CHECK: // %bb.0: +; CHECK-NEXT: uqadd z0.d, z0.d, #65280 // =0xff00 +; CHECK-NEXT: ret + %elt = insertelement undef, i64 65280, i32 0 + %splat = shufflevector %elt, undef, zeroinitializer + %out = call @llvm.aarch64.sve.uqadd.x.nxv2i64( %a, + %splat) + ret %out +} + ; UQSUB define @uqsub_b_lowimm( %a) { @@ -746,43 +969,6 @@ define @uqsub_d_highimm( %a) { ret %out } - -define @uqadd_s_highimm( %a) { -; CHECK-LABEL: uqadd_s_highimm: -; CHECK: // %bb.0: -; CHECK-NEXT: uqadd z0.s, z0.s, #8192 // =0x2000 -; CHECK-NEXT: ret - %elt = insertelement undef, i32 8192, i32 0 - %splat = shufflevector %elt, undef, zeroinitializer - %out = call @llvm.aarch64.sve.uqadd.x.nxv4i32( %a, - %splat) - ret %out -} - -define @uqadd_d_lowimm( %a) { -; CHECK-LABEL: uqadd_d_lowimm: -; CHECK: // %bb.0: -; CHECK-NEXT: uqadd z0.d, z0.d, #255 // =0xff -; CHECK-NEXT: ret - %elt = insertelement undef, i64 255, i32 0 - %splat = shufflevector %elt, undef, zeroinitializer - %out = call @llvm.aarch64.sve.uqadd.x.nxv2i64( %a, - %splat) - ret %out -} - -define @uqadd_d_highimm( %a) { -; CHECK-LABEL: uqadd_d_highimm: -; CHECK: // %bb.0: -; CHECK-NEXT: uqadd z0.d, z0.d, #65280 // =0xff00 -; CHECK-NEXT: ret - %elt = insertelement undef, i64 65280, i32 0 - %splat = shufflevector %elt, undef, zeroinitializer - %out = call @llvm.aarch64.sve.uqadd.x.nxv2i64( %a, - %splat) - ret %out -} - ; ASR define @asr_i8( %pg, %a) { @@ -1321,6 +1507,103 @@ define @lsr_i64_too_small( %pg, %out } +; As lsr_i32 but where pg is i8 based and thus compatible for i32. +define @lsr_i32_ptrue_all_b( %a) #0 { +; CHECK-LABEL: lsr_i32_ptrue_all_b: +; CHECK: lsr z0.s, z0.s, #1 +; CHECK-NEXT: ret + %pg.b = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.lsr.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As lsr_i32 but where pg is i16 based and thus compatible for i32. 
+define @lsr_i32_ptrue_all_h( %a) #0 { +; CHECK-LABEL: lsr_i32_ptrue_all_h: +; CHECK: lsr z0.s, z0.s, #1 +; CHECK-NEXT: ret + %pg.h = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %pg.h) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.lsr.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As lsr_i32 but where pg is i64 based, which is not compatibile for i32 and +; thus inactive lanes are important and the immediate form cannot be used. +define @lsr_i32_ptrue_all_d( %a) #0 { +; CHECK-LABEL: lsr_i32_ptrue_all_d: +; CHECK-DAG: ptrue [[PG:p[0-9]+]].d +; CHECK-DAG: lsr z0.s, [[PG]]/m, z0.s, #1 +; CHECK-NEXT: ret + %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %pg.d) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.lsr.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; +; MUL +; + +; As mul_i32 but where pg is i8 based and thus compatible for i32. +define @mul_i32_ptrue_all_b( %a) #0 { +; CHECK-LABEL: mul_i32_ptrue_all_b: +; CHECK: mul z0.s, z0.s, #1 +; CHECK-NEXT: ret + %pg.b = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.mul.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As mul_i32 but where pg is i16 based and thus compatible for i32. +define @mul_i32_ptrue_all_h( %a) #0 { +; CHECK-LABEL: mul_i32_ptrue_all_h: +; CHECK: mul z0.s, z0.s, #1 +; CHECK-NEXT: ret + %pg.h = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %pg.h) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.mul.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As mul_i32 but where pg is i64 based, which is not compatibile for i32 and +; thus inactive lanes are important and the immediate form cannot be used. 
+define @mul_i32_ptrue_all_d( %a) #0 { +; CHECK-LABEL: mul_i32_ptrue_all_d: +; CHECK-DAG: ptrue [[PG:p[0-9]+]].d +; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1 +; CHECK-DAG: mul z0.s, [[PG]]/m, z0.s, [[DUP]].s +; CHECK-NEXT: ret + %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %pg.d) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %b = tail call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) + %out = tail call @llvm.aarch64.sve.mul.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + declare @llvm.aarch64.sve.sqadd.x.nxv16i8(, ) declare @llvm.aarch64.sve.sqadd.x.nxv8i16(, ) declare @llvm.aarch64.sve.sqadd.x.nxv4i32(, ) @@ -1376,6 +1659,21 @@ declare @llvm.aarch64.sve.lsr.nxv8i16(, @llvm.aarch64.sve.lsr.nxv4i32(, , ) declare @llvm.aarch64.sve.lsr.nxv2i64(, , ) +declare @llvm.aarch64.sve.mul.nxv16i8(, , ) +declare @llvm.aarch64.sve.mul.nxv8i16(, , ) +declare @llvm.aarch64.sve.mul.nxv4i32(, , ) +declare @llvm.aarch64.sve.mul.nxv2i64(, , ) + +declare @llvm.aarch64.sve.convert.from.svbool.nxv4i1() +declare @llvm.aarch64.sve.convert.from.svbool.nxv8i1() +declare @llvm.aarch64.sve.convert.from.svbool.nxv2i1() + +declare @llvm.aarch64.sve.convert.to.svbool.nxv4i1() +declare @llvm.aarch64.sve.convert.to.svbool.nxv8i1() +declare @llvm.aarch64.sve.convert.to.svbool.nxv2i1() + +declare @llvm.aarch64.sve.dup.x.nxv4i32(i32) + declare @llvm.aarch64.sve.ptrue.nxv16i1(i32 %pattern) declare @llvm.aarch64.sve.ptrue.nxv8i1(i32 %pattern) declare @llvm.aarch64.sve.ptrue.nxv4i1(i32 %pattern) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-unpred-form.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-unpred-form.ll new file mode 100644 index 0000000..3800d37 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-unpred-form.ll @@ -0,0 +1,509 @@ +; RUN: llc < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; +; MUL +; + +define @mul_i8( %a, %b) #0 { +; CHECK-LABEL: mul_i8: +; CHECK: mul z0.b, z0.b, z1.b +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %out = call @llvm.aarch64.sve.mul.nxv16i8( %pg, + %a, + %b) + ret %out +} + +define @mul_i16( %a, %b) #0 { +; CHECK-LABEL: mul_i16: +; CHECK: mul z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %out = call @llvm.aarch64.sve.mul.nxv8i16( %pg, + %a, + %b) + ret %out +} + +define @mul_i32( %a, %b) #0 { +; CHECK-LABEL: mul_i32: +; CHECK: mul z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %out = call @llvm.aarch64.sve.mul.nxv4i32( %pg, + %a, + %b) + ret %out +} + +define @mul_i64( %a, %b) #0 { +; CHECK-LABEL: mul_i64: +; CHECK: mul z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %out = call @llvm.aarch64.sve.mul.nxv2i64( %pg, + %a, + %b) + ret %out +} + +; +; SMULH +; + +define @smulh_i8( %a, %b) #0 { +; CHECK-LABEL: smulh_i8: +; CHECK: smulh z0.b, z0.b, z1.b +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %out = call @llvm.aarch64.sve.smulh.nxv16i8( %pg, + %a, + %b) + ret %out +} + +define @smulh_i16( %a, %b) #0 { +; CHECK-LABEL: smulh_i16: +; CHECK: smulh z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %out = call @llvm.aarch64.sve.smulh.nxv8i16( %pg, + %a, + %b) + ret %out +} + +define @smulh_i32( %a, %b) #0 { +; CHECK-LABEL: smulh_i32: +; CHECK: smulh z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %out = 
call @llvm.aarch64.sve.smulh.nxv4i32( %pg, + %a, + %b) + ret %out +} + +define @smulh_i64( %a, %b) #0 { +; CHECK-LABEL: smulh_i64: +; CHECK: smulh z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %out = call @llvm.aarch64.sve.smulh.nxv2i64( %pg, + %a, + %b) + ret %out +} + +; +; UMULH +; + +define @umulh_i8( %a, %b) #0 { +; CHECK-LABEL: umulh_i8: +; CHECK: umulh z0.b, z0.b, z1.b +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %out = call @llvm.aarch64.sve.umulh.nxv16i8( %pg, + %a, + %b) + ret %out +} + +define @umulh_i16( %a, %b) #0 { +; CHECK-LABEL: umulh_i16: +; CHECK: umulh z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %out = call @llvm.aarch64.sve.umulh.nxv8i16( %pg, + %a, + %b) + ret %out +} + +define @umulh_i32( %a, %b) #0 { +; CHECK-LABEL: umulh_i32: +; CHECK: umulh z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %out = call @llvm.aarch64.sve.umulh.nxv4i32( %pg, + %a, + %b) + ret %out +} + +define @umulh_i64( %a, %b) #0 { +; CHECK-LABEL: umulh_i64: +; CHECK: umulh z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %out = call @llvm.aarch64.sve.umulh.nxv2i64( %pg, + %a, + %b) + ret %out +} + +; As umulh_i32 but where pg is i8 based and thus compatible for i32. +define @umulh_i32_ptrue_all_b( %a, %b) #0 { +; CHECK-LABEL: umulh_i32_ptrue_all_b: +; CHECK: umulh z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %pg.b = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %out = tail call @llvm.aarch64.sve.umulh.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As umulh_i32 but where pg is i16 based and thus compatible for i32. +define @umulh_i32_ptrue_all_h( %a, %b) #0 { +; CHECK-LABEL: umulh_i32_ptrue_all_h: +; CHECK: umulh z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %pg.h = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %pg.h) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %out = tail call @llvm.aarch64.sve.umulh.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As umulh_i32 but where pg is i64 based, which is not compatibile for i32 and +; thus inactive lanes are important and the immediate form cannot be used. 
+define @umulh_i32_ptrue_all_d( %a, %b) #0 { +; CHECK-LABEL: umulh_i32_ptrue_all_d: +; CHECK-DAG: ptrue [[PG:p[0-9]+]].d +; CHECK-DAG: umulh z0.s, [[PG]]/m, z0.s, z1.s +; CHECK-NEXT: ret + %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %pg.d) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %out = tail call @llvm.aarch64.sve.umulh.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; +; ASR (wide) +; + +define @asr_i8( %a, %b) #0 { +; CHECK-LABEL: asr_i8: +; CHECK: asr z0.b, z0.b, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %out = call @llvm.aarch64.sve.asr.wide.nxv16i8( %pg, + %a, + %b) + ret %out +} + +define @asr_i16( %a, %b) #0 { +; CHECK-LABEL: asr_i16: +; CHECK: asr z0.h, z0.h, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %out = call @llvm.aarch64.sve.asr.wide.nxv8i16( %pg, + %a, + %b) + ret %out +} + +define @asr_i32( %a, %b) #0 { +; CHECK-LABEL: asr_i32: +; CHECK: asr z0.s, z0.s, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %out = call @llvm.aarch64.sve.asr.wide.nxv4i32( %pg, + %a, + %b) + ret %out +} + +; +; LSL (wide) +; + +define @lsl_i8( %a, %b) #0 { +; CHECK-LABEL: lsl_i8: +; CHECK: lsl z0.b, z0.b, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %out = call @llvm.aarch64.sve.lsl.wide.nxv16i8( %pg, + %a, + %b) + ret %out +} + +define @lsl_i16( %a, %b) #0 { +; CHECK-LABEL: lsl_i16: +; CHECK: lsl z0.h, z0.h, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %out = call @llvm.aarch64.sve.lsl.wide.nxv8i16( %pg, + %a, + %b) + ret %out +} + +define @lsl_i32( %a, %b) #0 { +; CHECK-LABEL: lsl_i32: +; CHECK: lsl z0.s, z0.s, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %out = call @llvm.aarch64.sve.lsl.wide.nxv4i32( %pg, + %a, + %b) + ret %out +} + +; +; LSR (wide) +; + +define @lsr_i8( %a, %b) #0 { +; CHECK-LABEL: lsr_i8: +; CHECK: lsr z0.b, z0.b, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %out = call @llvm.aarch64.sve.lsr.wide.nxv16i8( %pg, + %a, + %b) + ret %out +} + +define @lsr_i16( %a, %b) #0 { +; CHECK-LABEL: lsr_i16: +; CHECK: lsr z0.h, z0.h, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %out = call @llvm.aarch64.sve.lsr.wide.nxv8i16( %pg, + %a, + %b) + ret %out +} + +define @lsr_i32( %a, %b) #0 { +; CHECK-LABEL: lsr_i32: +; CHECK: lsr z0.s, z0.s, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %out = call @llvm.aarch64.sve.lsr.wide.nxv4i32( %pg, + %a, + %b) + ret %out +} + +; As lsr_i32 but where pg is i8 based and thus compatible for i32. +define @lsr_i32_ptrue_all_b( %a, %b) #0 { +; CHECK-LABEL: lsr_i32_ptrue_all_b: +; CHECK: lsr z0.s, z0.s, z1.d +; CHECK-NEXT: ret + %pg.b = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %out = tail call @llvm.aarch64.sve.lsr.wide.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As lsr_i32 but where pg is i16 based and thus compatible for i32. 
+define @lsr_i32_ptrue_all_h( %a, %b) #0 { +; CHECK-LABEL: lsr_i32_ptrue_all_h: +; CHECK: lsr z0.s, z0.s, z1.d +; CHECK-NEXT: ret + %pg.h = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %pg.h) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %out = tail call @llvm.aarch64.sve.lsr.wide.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; As lsr_i32 but where pg is i64 based, which is not compatibile for i32 and +; thus inactive lanes are important and the immediate form cannot be used. +define @lsr_i32_ptrue_all_d( %a, %b) #0 { +; CHECK-LABEL: lsr_i32_ptrue_all_d: +; CHECK-DAG: ptrue [[PG:p[0-9]+]].d +; CHECK-DAG: lsr z0.s, [[PG]]/m, z0.s, z1.d +; CHECK-NEXT: ret + %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %pg.b = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %pg.d) + %pg.s = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %pg.b) + %out = tail call @llvm.aarch64.sve.lsr.wide.nxv4i32( %pg.s, + %a, + %b) + ret %out +} + +; +; FADD +; + +define @fadd_half( %a, %b) #0 { +; CHECK-LABEL: fadd_half: +; CHECK: fadd z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %out = call @llvm.aarch64.sve.fadd.nxv8f16( %pg, + %a, + %b) + ret %out +} + +define @fadd_float( %a, %b) #0 { +; CHECK-LABEL: fadd_float: +; CHECK: fadd z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %out = call @llvm.aarch64.sve.fadd.nxv4f32( %pg, + %a, + %b) + ret %out +} + +define @fadd_double( %a, %b) #0 { +; CHECK-LABEL: fadd_double: +; CHECK: fadd z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %out = call @llvm.aarch64.sve.fadd.nxv2f64( %pg, + %a, + %b) + ret %out +} + +; +; FSUB +; + +define @fsub_half( %a, %b) #0 { +; CHECK-LABEL: fsub_half: +; CHECK: fsub z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %out = call @llvm.aarch64.sve.fsub.nxv8f16( %pg, + %a, + %b) + ret %out +} + +define @fsub_float( %a, %b) #0 { +; CHECK-LABEL: fsub_float: +; CHECK: fsub z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %out = call @llvm.aarch64.sve.fsub.nxv4f32( %pg, + %a, + %b) + ret %out +} + +define @fsub_double( %a, %b) #0 { +; CHECK-LABEL: fsub_double: +; CHECK: fsub z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %out = call @llvm.aarch64.sve.fsub.nxv2f64( %pg, + %a, + %b) + ret %out +} + +; +; FMUL +; + +define @fmul_half( %a, %b) #0 { +; CHECK-LABEL: fmul_half: +; CHECK: fmul z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %out = call @llvm.aarch64.sve.fmul.nxv8f16( %pg, + %a, + %b) + ret %out +} + +define @fmul_float( %a, %b) #0 { +; CHECK-LABEL: fmul_float: +; CHECK: fmul z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %out = call @llvm.aarch64.sve.fmul.nxv4f32( %pg, + %a, + %b) + ret %out +} + +define @fmul_double( %a, %b) #0 { +; CHECK-LABEL: fmul_double: +; CHECK: fmul z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %out = call @llvm.aarch64.sve.fmul.nxv2f64( %pg, + %a, + %b) + ret %out +} + +declare @llvm.aarch64.sve.mul.nxv16i8(, , ) +declare @llvm.aarch64.sve.mul.nxv8i16(, , ) +declare @llvm.aarch64.sve.mul.nxv4i32(, , ) +declare @llvm.aarch64.sve.mul.nxv2i64(, , ) + +declare @llvm.aarch64.sve.smulh.nxv16i8(, , ) +declare 
@llvm.aarch64.sve.smulh.nxv8i16(, , ) +declare @llvm.aarch64.sve.smulh.nxv4i32(, , ) +declare @llvm.aarch64.sve.smulh.nxv2i64(, , ) + +declare @llvm.aarch64.sve.umulh.nxv16i8(, , ) +declare @llvm.aarch64.sve.umulh.nxv8i16(, , ) +declare @llvm.aarch64.sve.umulh.nxv4i32(, , ) +declare @llvm.aarch64.sve.umulh.nxv2i64(, , ) + +declare @llvm.aarch64.sve.asr.wide.nxv16i8(, , ) +declare @llvm.aarch64.sve.asr.wide.nxv8i16(, , ) +declare @llvm.aarch64.sve.asr.wide.nxv4i32(, , ) + +declare @llvm.aarch64.sve.lsl.wide.nxv16i8(, , ) +declare @llvm.aarch64.sve.lsl.wide.nxv8i16(, , ) +declare @llvm.aarch64.sve.lsl.wide.nxv4i32(, , ) + +declare @llvm.aarch64.sve.lsr.wide.nxv16i8(, , ) +declare @llvm.aarch64.sve.lsr.wide.nxv8i16(, , ) +declare @llvm.aarch64.sve.lsr.wide.nxv4i32(, , ) + +declare @llvm.aarch64.sve.fadd.nxv8f16(, , ) +declare @llvm.aarch64.sve.fadd.nxv4f32(, , ) +declare @llvm.aarch64.sve.fadd.nxv2f64(, , ) + +declare @llvm.aarch64.sve.fsub.nxv8f16(, , ) +declare @llvm.aarch64.sve.fsub.nxv4f32(, , ) +declare @llvm.aarch64.sve.fsub.nxv2f64(, , ) + +declare @llvm.aarch64.sve.fmul.nxv8f16(, , ) +declare @llvm.aarch64.sve.fmul.nxv4f32(, , ) +declare @llvm.aarch64.sve.fmul.nxv2f64(, , ) + +declare @llvm.aarch64.sve.convert.from.svbool.nxv4i1() +declare @llvm.aarch64.sve.convert.from.svbool.nxv8i1() +declare @llvm.aarch64.sve.convert.from.svbool.nxv2i1() + +declare @llvm.aarch64.sve.convert.to.svbool.nxv4i1() +declare @llvm.aarch64.sve.convert.to.svbool.nxv8i1() +declare @llvm.aarch64.sve.convert.to.svbool.nxv2i1() + +declare @llvm.aarch64.sve.ptrue.nxv16i1(i32) +declare @llvm.aarch64.sve.ptrue.nxv8i1(i32) +declare @llvm.aarch64.sve.ptrue.nxv4i1(i32) +declare @llvm.aarch64.sve.ptrue.nxv2i1(i32) + +attributes #0 = { "target-features"="+sve2" }
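The effect of the change can be summarised with a small standalone IR example
(illustrative only, not part of the patch; the function name example_smax_imm
and the file layout are made up for the sketch). Because the governing
predicate is a b8 "ptrue all" reinterpreted down to a b32 predicate, it is
all active for the i32 operation, so instruction selection can now drop the
predicate and use the unpredicated immediate form "smax z0.s, z0.s, #1", as
in the smax_i32_ptrue_all_b test above; a b64-based ptrue would not qualify
and would keep the predicated form.

; Illustrative sketch, assuming compilation with: llc -mtriple=aarch64 -mattr=+sve
; The all-active b8 predicate is a superset of the b32 predicate operand, so the
; predicated smax intrinsic selects to the unpredicated immediate instruction.
define <vscale x 4 x i32> @example_smax_imm(<vscale x 4 x i32> %a) #0 {
  %pg.b = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
  %pg.s = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
  %b = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.smax.nxv4i32(<vscale x 4 x i1> %pg.s,
                                                                <vscale x 4 x i32> %a,
                                                                <vscale x 4 x i32> %b)
  ret <vscale x 4 x i32> %out
}

declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32)
declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
declare <vscale x 4 x i32> @llvm.aarch64.sve.smax.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)

attributes #0 = { "target-features"="+sve" }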