[AArch64][SVE] Better utilisation of unpredicated forms of arithmetic intrinsics

author Bradley Smith <bradley.smith@arm.com>

Fri, 30 Apr 2021 15:17:37 +0000 (16:17 +0100)

committer Bradley Smith <bradley.smith@arm.com>

Mon, 10 May 2021 12:05:37 +0000 (13:05 +0100)
author Bradley Smith <bradley.smith@arm.com>
Fri, 30 Apr 2021 15:17:37 +0000 (16:17 +0100)
committer Bradley Smith <bradley.smith@arm.com>
Mon, 10 May 2021 12:05:37 +0000 (13:05 +0100)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp

index 3d8aa29..d7c1e32 100644 (file)
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -335,6 +335,8 @@ private:
    bool SelectSVEArithImm(SDValue N, MVT VT, SDValue &Imm);
    bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base,
                                 SDValue &Offset);
+
+  bool SelectAllActivePredicate(SDValue N);
  };
  } // end anonymous namespace
  
@@ -4983,3 +4985,10 @@ bool AArch64DAGToDAGISel::SelectSVERegRegAddrMode(SDValue N, unsigned Scale,
  
    return false;
  }
+
+bool AArch64DAGToDAGISel::SelectAllActivePredicate(SDValue N) {
+  const AArch64TargetLowering *TLI =
+      static_cast<const AArch64TargetLowering *>(getTargetLowering());
+
+  return TLI->isAllActivePredicate(N);
+}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

index 6b1105d..3a71405 100644 (file)
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13754,6 +13754,28 @@ static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
                       Zero);
  }
  
+static bool isAllActivePredicate(SDValue N) {
+  unsigned NumElts = N.getValueType().getVectorMinNumElements();
+
+  // Look through cast.
+  while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
+    N = N.getOperand(0);
+    // When reinterpreting from a type with fewer elements the "new" elements
+    // are not active, so bail if they're likely to be used.
+    if (N.getValueType().getVectorMinNumElements() < NumElts)
+      return false;
+  }
+
+  // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
+  // or smaller than the implicit element type represented by N.
+  // NOTE: A larger element count implies a smaller element type.
+  if (N.getOpcode() == AArch64ISD::PTRUE &&
+      N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
+    return N.getValueType().getVectorMinNumElements() >= NumElts;
+
+  return false;
+}
+
  // If a merged operation has no inactive lanes we can relax it to a predicated
  // or unpredicated operation, which potentially allows better isel (perhaps
  // using immediate forms) or relaxing register reuse requirements.
@@ -13764,8 +13786,7 @@ static SDValue convertMergedOpToPredOp(SDNode *N, unsigned PredOpc,
    SDValue Pg = N->getOperand(1);
  
    // ISD way to specify an all active predicate.
-  if ((Pg.getOpcode() == AArch64ISD::PTRUE) &&
-      (Pg.getConstantOperandVal(0) == AArch64SVEPredPattern::all))
+  if (isAllActivePredicate(Pg))
      return DAG.getNode(PredOpc, SDLoc(N), N->getValueType(0), Pg,
                         N->getOperand(2), N->getOperand(3));
  
@@ -13858,6 +13879,12 @@ static SDValue performIntrinsicCombine(SDNode *N,
                         N->getOperand(1));
    case Intrinsic::aarch64_sve_ext:
      return LowerSVEIntrinsicEXT(N, DAG);
+  case Intrinsic::aarch64_sve_mul:
+    return convertMergedOpToPredOp(N, AArch64ISD::MUL_PRED, DAG);
+  case Intrinsic::aarch64_sve_smulh:
+    return convertMergedOpToPredOp(N, AArch64ISD::MULHS_PRED, DAG);
+  case Intrinsic::aarch64_sve_umulh:
+    return convertMergedOpToPredOp(N, AArch64ISD::MULHU_PRED, DAG);
    case Intrinsic::aarch64_sve_smin:
      return convertMergedOpToPredOp(N, AArch64ISD::SMIN_PRED, DAG);
    case Intrinsic::aarch64_sve_umin:
@@ -13872,6 +13899,12 @@ static SDValue performIntrinsicCombine(SDNode *N,
      return convertMergedOpToPredOp(N, AArch64ISD::SRL_PRED, DAG);
    case Intrinsic::aarch64_sve_asr:
      return convertMergedOpToPredOp(N, AArch64ISD::SRA_PRED, DAG);
+  case Intrinsic::aarch64_sve_fadd:
+    return convertMergedOpToPredOp(N, AArch64ISD::FADD_PRED, DAG);
+  case Intrinsic::aarch64_sve_fsub:
+    return convertMergedOpToPredOp(N, AArch64ISD::FSUB_PRED, DAG);
+  case Intrinsic::aarch64_sve_fmul:
+    return convertMergedOpToPredOp(N, AArch64ISD::FMUL_PRED, DAG);
    case Intrinsic::aarch64_sve_cmphs:
      if (!N->getOperand(2).getValueType().isFloatingPoint())
        return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
@@ -17613,3 +17646,7 @@ SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
  
    return Op;
  }
+
+bool AArch64TargetLowering::isAllActivePredicate(SDValue N) const {
+  return ::isAllActivePredicate(N);
+}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h

index 8af8417..90f77a7 100644 (file)
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -820,6 +820,8 @@ public:
      return 128;
    }
  
+  bool isAllActivePredicate(SDValue N) const;
+
  private:
    /// Keep a pointer to the AArch64Subtarget around so that we can
    /// make the right decision when generating code for different targets.
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

index 634860f..6e92c92 100644 (file)
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1378,9 +1378,9 @@ let Predicates = [HasSVE] in {
    defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr", AArch64lsr_p>;
    defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl", AArch64lsl_p>;
  
-  defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr">;
-  defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr">;
-  defm LSL_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b11, "lsl">;
+  defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr", int_aarch64_sve_asr_wide>;
+  defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr", int_aarch64_sve_lsr_wide>;
+  defm LSL_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b11, "lsl", int_aarch64_sve_lsl_wide>;
  
    // Predicated shifts
    defm ASR_ZPmI  : sve_int_bin_pred_shift_imm_right_dup<0b0000, "asr",  "ASR_ZPZI",  int_aarch64_sve_asr>;
@@ -2410,24 +2410,6 @@ let Predicates = [HasSVE2] in {
    defm UMULH_ZZZ  : sve2_int_mul<0b011,  "umulh", null_frag, AArch64umulh_p>;
    defm PMUL_ZZZ   : sve2_int_mul_single<0b001, "pmul", int_aarch64_sve_pmul>;
  
-  // Add patterns for unpredicated version of smulh and umulh.
-  def : Pat<(nxv16i8 (int_aarch64_sve_smulh (nxv16i1 (AArch64ptrue 31)), nxv16i8:$Op1, nxv16i8:$Op2)),
-            (SMULH_ZZZ_B $Op1, $Op2)>;
-  def : Pat<(nxv8i16 (int_aarch64_sve_smulh (nxv8i1 (AArch64ptrue 31)), nxv8i16:$Op1, nxv8i16:$Op2)),
-            (SMULH_ZZZ_H $Op1, $Op2)>;
-  def : Pat<(nxv4i32 (int_aarch64_sve_smulh (nxv4i1 (AArch64ptrue 31)), nxv4i32:$Op1, nxv4i32:$Op2)),
-            (SMULH_ZZZ_S $Op1, $Op2)>;
-  def : Pat<(nxv2i64 (int_aarch64_sve_smulh (nxv2i1 (AArch64ptrue 31)), nxv2i64:$Op1, nxv2i64:$Op2)),
-            (SMULH_ZZZ_D $Op1, $Op2)>;
-  def : Pat<(nxv16i8 (int_aarch64_sve_umulh (nxv16i1 (AArch64ptrue 31)), nxv16i8:$Op1, nxv16i8:$Op2)),
-            (UMULH_ZZZ_B $Op1, $Op2)>;
-  def : Pat<(nxv8i16 (int_aarch64_sve_umulh (nxv8i1 (AArch64ptrue 31)), nxv8i16:$Op1, nxv8i16:$Op2)),
-            (UMULH_ZZZ_H $Op1, $Op2)>;
-  def : Pat<(nxv4i32 (int_aarch64_sve_umulh (nxv4i1 (AArch64ptrue 31)), nxv4i32:$Op1, nxv4i32:$Op2)),
-            (UMULH_ZZZ_S $Op1, $Op2)>;
-  def : Pat<(nxv2i64 (int_aarch64_sve_umulh (nxv2i1 (AArch64ptrue 31)), nxv2i64:$Op1, nxv2i64:$Op2)),
-            (UMULH_ZZZ_D $Op1, $Op2)>;
-
    // SVE2 complex integer dot product (indexed)
    defm CDOT_ZZZI : sve2_cintx_dot_by_indexed_elem<"cdot", int_aarch64_sve_cdot_lane>;
  
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td

index 7e05a5c..3f6cdad 100644 (file)
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -221,6 +221,8 @@ def SVEShiftImmR16 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 16, true>", []
  def SVEShiftImmR32 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 32, true>", []>;
  def SVEShiftImmR64 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 64, true>", []>;
  
+def SVEAllActive : ComplexPattern<untyped, 0, "SelectAllActivePredicate", []>;
+
  class SVEExactFPImm<string Suffix, string ValA, string ValB> : AsmOperandClass {
    let Name = "SVEExactFPImmOperand" # Suffix;
    let DiagnosticType = "Invalid" # Name;
@@ -339,9 +341,9 @@ class SVE_1_Op_Imm_OptLsl_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty
    : Pat<(vt (op (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm, i32:$shift)))))),
          (inst $Op1, i32:$imm, i32:$shift)>;
  
-class SVE_1_Op_Imm_Arith_Pred_Pat<ValueType vt, ValueType pt, SDPatternOperator op,
+class SVE_1_Op_Imm_Arith_All_Active<ValueType vt, ValueType pt, SDPatternOperator op,
                                    ZPRRegOp zprty, ValueType it, ComplexPattern cpx, Instruction inst>
-  : Pat<(vt (op (pt (AArch64ptrue 31)), (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))),
+  : Pat<(vt (op (pt (SVEAllActive)), (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))),
          (inst $Op1, i32:$imm)>;
  
  class SVE_1_Op_Imm_Log_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty,
@@ -357,7 +359,7 @@ class SVE_2_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
  class SVE_2_Op_Pred_All_Active<ValueType vtd, SDPatternOperator op,
                                 ValueType pt, ValueType vt1, ValueType vt2,
                                 Instruction inst>
-: Pat<(vtd (op (pt (AArch64ptrue 31)), vt1:$Op1, vt2:$Op2)),
+: Pat<(vtd (op (pt (SVEAllActive)), vt1:$Op1, vt2:$Op2)),
        (inst $Op1, $Op2)>;
  
  class SVE_3_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
@@ -432,7 +434,7 @@ class SVE_Shift_DupImm_Pred_Pat<ValueType vt, SDPatternOperator op,
  class SVE_Shift_DupImm_All_Active_Pat<ValueType vt, SDPatternOperator op,
                                        ValueType pt, ValueType it,
                                        ComplexPattern cast, Instruction inst>
-: Pat<(vt (op (pt (AArch64ptrue 31)), vt:$Rn, (vt (AArch64dup (it (cast i32:$imm)))))),
+: Pat<(vt (op (pt (SVEAllActive)), vt:$Rn, (vt (AArch64dup (it (cast i32:$imm)))))),
        (inst $Rn, i32:$imm)>;
  
  //
@@ -4052,10 +4054,10 @@ multiclass sve_int_arith_imm1<bits<2> opc, string asm, SDPatternOperator op> {
    def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, simm8>;
    def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, simm8>;
  
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>;
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1,  op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>;
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1,  op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>;
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1,  op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv8i16, nxv8i1,  op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv4i32, nxv4i1,  op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv2i64, nxv2i1,  op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>;
  }
  
  multiclass sve_int_arith_imm1_unsigned<bits<2> opc, string asm, SDPatternOperator op> {
@@ -4064,10 +4066,10 @@ multiclass sve_int_arith_imm1_unsigned<bits<2> opc, string asm, SDPatternOperato
    def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, imm0_255>;
    def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, imm0_255>;
  
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithUImm8Pat, !cast<Instruction>(NAME # _B)>;
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithUImm16Pat, !cast<Instruction>(NAME # _H)>;
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithUImm32Pat, !cast<Instruction>(NAME # _S)>;
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithUImm64Pat, !cast<Instruction>(NAME # _D)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithUImm8Pat, !cast<Instruction>(NAME # _B)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithUImm16Pat, !cast<Instruction>(NAME # _H)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithUImm32Pat, !cast<Instruction>(NAME # _S)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithUImm64Pat, !cast<Instruction>(NAME # _D)>;
  }
  
  multiclass sve_int_arith_imm2<string asm, SDPatternOperator op> {
@@ -4076,10 +4078,10 @@ multiclass sve_int_arith_imm2<string asm, SDPatternOperator op> {
    def _S : sve_int_arith_imm<0b10, 0b110000, asm, ZPR32, simm8>;
    def _D : sve_int_arith_imm<0b11, 0b110000, asm, ZPR64, simm8>;
  
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>;
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>;
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>;
-  def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>;
+  def : SVE_1_Op_Imm_Arith_All_Active<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>;
  }
  
  //===----------------------------------------------------------------------===//
@@ -5169,10 +5171,14 @@ class sve_int_bin_cons_shift_wide<bits<2> sz8_64, bits<2> opc, string asm,
    let Inst{4-0}   = Zd;
  }
  
-multiclass sve_int_bin_cons_shift_wide<bits<2> opc, string asm> {
+multiclass sve_int_bin_cons_shift_wide<bits<2> opc, string asm, SDPatternOperator op> {
    def _B : sve_int_bin_cons_shift_wide<0b00, opc, asm, ZPR8>;
    def _H : sve_int_bin_cons_shift_wide<0b01, opc, asm, ZPR16>;
    def _S : sve_int_bin_cons_shift_wide<0b10, opc, asm, ZPR32>;
+
+  def : SVE_2_Op_Pred_All_Active<nxv16i8, op, nxv16i1, nxv16i8, nxv2i64, !cast<Instruction>(NAME # _B)>;
+  def : SVE_2_Op_Pred_All_Active<nxv8i16, op, nxv8i1, nxv8i16, nxv2i64, !cast<Instruction>(NAME # _H)>;
+  def : SVE_2_Op_Pred_All_Active<nxv4i32, op, nxv4i1, nxv4i32, nxv2i64, !cast<Instruction>(NAME # _S)>;
  }
  
  class sve_int_bin_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll

index 33c347d..c978a72 100644 (file)
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll
@@ -110,6 +110,52 @@ define <vscale x 2 x i64> @smax_i64_out_of_range(<vscale x 2 x i64> %a) {
    ret <vscale x 2 x i64> %out
  }
  
+; As smax_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @smax_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: smax_i32_ptrue_all_b:
+; CHECK: smax z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smax.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As smax_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @smax_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: smax_i32_ptrue_all_h:
+; CHECK: smax z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smax.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As smax_i32 but where pg is i64 based, which is not compatibile for i32 and
+; thus inactive lanes are important and the immediate form cannot be used.
+define <vscale x 4 x i32> @smax_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: smax_i32_ptrue_all_d:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1
+; CHECK-DAG: smax z0.s, [[PG]]/m, z0.s, [[DUP]].s
+; CHECK-NEXT: ret
+  %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smax.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
  
  ; SMIN
  
@@ -220,6 +266,53 @@ define <vscale x 2 x i64> @smin_i64_out_of_range(<vscale x 2 x i64> %a) {
    ret <vscale x 2 x i64> %out
  }
  
+; As smin_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @smin_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: smin_i32_ptrue_all_b:
+; CHECK: smin z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smin.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As smin_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @smin_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: smin_i32_ptrue_all_h:
+; CHECK: smin z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smin.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As smin_i32 but where pg is i64 based, which is not compatibile for i32 and
+; thus inactive lanes are important and the immediate form cannot be used.
+define <vscale x 4 x i32> @smin_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: smin_i32_ptrue_all_d:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1
+; CHECK-DAG: smin z0.s, [[PG]]/m, z0.s, [[DUP]].s
+; CHECK-NEXT: ret
+  %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smin.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
  ; UMAX
  
  define <vscale x 16 x i8> @umax_i8(<vscale x 16 x i8> %a) {
@@ -329,6 +422,53 @@ define <vscale x 2 x i64> @umax_i64_out_of_range(<vscale x 2 x i64> %a) {
    ret <vscale x 2 x i64> %out
  }
  
+; As umax_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @umax_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: umax_i32_ptrue_all_b:
+; CHECK: umax z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umax.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As umax_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @umax_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: umax_i32_ptrue_all_h:
+; CHECK: umax z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umax.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As umax_i32 but where pg is i64 based, which is not compatibile for i32 and
+; thus inactive lanes are important and the immediate form cannot be used.
+define <vscale x 4 x i32> @umax_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: umax_i32_ptrue_all_d:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1
+; CHECK-DAG: umax z0.s, [[PG]]/m, z0.s, [[DUP]].s
+; CHECK-NEXT: ret
+  %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umax.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
  ; UMIN
  
  define <vscale x 16 x i8> @umin_i8(<vscale x 16 x i8> %a) {
@@ -438,6 +578,53 @@ define <vscale x 2 x i64> @umin_i64_out_of_range(<vscale x 2 x i64> %a) {
    ret <vscale x 2 x i64> %out
  }
  
+; As umin_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @umin_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: umin_i32_ptrue_all_b:
+; CHECK: umin z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umin.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As umin_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @umin_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: umin_i32_ptrue_all_h:
+; CHECK: umin z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umin.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As umin_i32 but where pg is i64 based, which is not compatibile for i32 and
+; thus inactive lanes are important and the immediate form cannot be used.
+define <vscale x 4 x i32> @umin_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: umin_i32_ptrue_all_d:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1
+; CHECK-DAG: umin z0.s, [[PG]]/m, z0.s, [[DUP]].s
+; CHECK-NEXT: ret
+  %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umin.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
  ; SQADD
  
  define <vscale x 16 x i8> @sqadd_b_lowimm(<vscale x 16 x i8> %a) {
@@ -660,6 +847,42 @@ define <vscale x 4 x i32> @uqadd_s_lowimm(<vscale x 4 x i32> %a) {
    ret <vscale x 4 x i32> %out
  }
  
+define <vscale x 4 x i32> @uqadd_s_highimm(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: uqadd_s_highimm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uqadd z0.s, z0.s, #8192 // =0x2000
+; CHECK-NEXT:    ret
+  %elt = insertelement <vscale x 4 x i32> undef, i32 8192, i32 0
+  %splat = shufflevector <vscale x 4 x i32> %elt, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.uqadd.x.nxv4i32(<vscale x 4 x i32> %a,
+                                                                   <vscale x 4 x i32> %splat)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @uqadd_d_lowimm(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: uqadd_d_lowimm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uqadd z0.d, z0.d, #255 // =0xff
+; CHECK-NEXT:    ret
+  %elt = insertelement <vscale x 2 x i64> undef, i64 255, i32 0
+  %splat = shufflevector <vscale x 2 x i64> %elt, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.uqadd.x.nxv2i64(<vscale x 2 x i64> %a,
+                                                                   <vscale x 2 x i64> %splat)
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i64> @uqadd_d_highimm(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: uqadd_d_highimm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uqadd z0.d, z0.d, #65280 // =0xff00
+; CHECK-NEXT:    ret
+  %elt = insertelement <vscale x 2 x i64> undef, i64 65280, i32 0
+  %splat = shufflevector <vscale x 2 x i64> %elt, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.uqadd.x.nxv2i64(<vscale x 2 x i64> %a,
+                                                                   <vscale x 2 x i64> %splat)
+  ret <vscale x 2 x i64> %out
+}
+
  ; UQSUB
  
  define <vscale x 16 x i8> @uqsub_b_lowimm(<vscale x 16 x i8> %a) {
@@ -746,43 +969,6 @@ define <vscale x 2 x i64> @uqsub_d_highimm(<vscale x 2 x i64> %a) {
    ret <vscale x 2 x i64> %out
  }
  
-
-define <vscale x 4 x i32> @uqadd_s_highimm(<vscale x 4 x i32> %a) {
-; CHECK-LABEL: uqadd_s_highimm:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uqadd z0.s, z0.s, #8192 // =0x2000
-; CHECK-NEXT:    ret
-  %elt = insertelement <vscale x 4 x i32> undef, i32 8192, i32 0
-  %splat = shufflevector <vscale x 4 x i32> %elt, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
-  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.uqadd.x.nxv4i32(<vscale x 4 x i32> %a,
-                                                                   <vscale x 4 x i32> %splat)
-  ret <vscale x 4 x i32> %out
-}
-
-define <vscale x 2 x i64> @uqadd_d_lowimm(<vscale x 2 x i64> %a) {
-; CHECK-LABEL: uqadd_d_lowimm:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uqadd z0.d, z0.d, #255 // =0xff
-; CHECK-NEXT:    ret
-  %elt = insertelement <vscale x 2 x i64> undef, i64 255, i32 0
-  %splat = shufflevector <vscale x 2 x i64> %elt, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
-  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.uqadd.x.nxv2i64(<vscale x 2 x i64> %a,
-                                                                   <vscale x 2 x i64> %splat)
-  ret <vscale x 2 x i64> %out
-}
-
-define <vscale x 2 x i64> @uqadd_d_highimm(<vscale x 2 x i64> %a) {
-; CHECK-LABEL: uqadd_d_highimm:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uqadd z0.d, z0.d, #65280 // =0xff00
-; CHECK-NEXT:    ret
-  %elt = insertelement <vscale x 2 x i64> undef, i64 65280, i32 0
-  %splat = shufflevector <vscale x 2 x i64> %elt, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
-  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.uqadd.x.nxv2i64(<vscale x 2 x i64> %a,
-                                                                   <vscale x 2 x i64> %splat)
-  ret <vscale x 2 x i64> %out
-}
-
  ; ASR
  
  define <vscale x 16 x i8> @asr_i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a) {
@@ -1321,6 +1507,103 @@ define <vscale x 2 x i64> @lsr_i64_too_small(<vscale x 2 x i1> %pg, <vscale x 2
    ret <vscale x 2 x i64> %out
  }
  
+; As lsr_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @lsr_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: lsr_i32_ptrue_all_b:
+; CHECK: lsr z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                    <vscale x 4 x i32> %a,
+                                                                    <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As lsr_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @lsr_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: lsr_i32_ptrue_all_h:
+; CHECK: lsr z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                    <vscale x 4 x i32> %a,
+                                                                    <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As lsr_i32 but where pg is i64 based, which is not compatibile for i32 and
+; thus inactive lanes are important and the immediate form cannot be used.
+define <vscale x 4 x i32> @lsr_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: lsr_i32_ptrue_all_d:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: lsr z0.s, [[PG]]/m, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                    <vscale x 4 x i32> %a,
+                                                                    <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+;
+; MUL
+;
+
+; As mul_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @mul_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: mul_i32_ptrue_all_b:
+; CHECK: mul z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As mul_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @mul_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: mul_i32_ptrue_all_h:
+; CHECK: mul z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As mul_i32 but where pg is i64 based, which is not compatibile for i32 and
+; thus inactive lanes are important and the immediate form cannot be used.
+define <vscale x 4 x i32> @mul_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: mul_i32_ptrue_all_d:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1
+; CHECK-DAG: mul z0.s, [[PG]]/m, z0.s, [[DUP]].s
+; CHECK-NEXT: ret
+  %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
  declare <vscale x 16 x i8> @llvm.aarch64.sve.sqadd.x.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
  declare <vscale x 8 x i16> @llvm.aarch64.sve.sqadd.x.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
  declare <vscale x 4 x i32> @llvm.aarch64.sve.sqadd.x.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
@@ -1376,6 +1659,21 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.lsr.nxv8i16(<vscale x 8 x i1>, <vsc
  declare <vscale x 4 x i32> @llvm.aarch64.sve.lsr.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
  declare <vscale x 2 x i64> @llvm.aarch64.sve.lsr.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
  
+declare <vscale x 16 x i8> @llvm.aarch64.sve.mul.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 16 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)
+
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
+
  declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 %pattern)
  declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 %pattern)
  declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 %pattern)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-unpred-form.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-unpred-form.ll

new file mode 100644 (file)

index 0000000..3800d37
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-unpred-form.ll
@@ -0,0 +1,509 @@
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+;
+; MUL
+;
+
+define <vscale x 16 x i8> @mul_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: mul_i8:
+; CHECK: mul z0.b, z0.b, z1.b
+; CHECK-NEXT: ret
+  %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.mul.nxv16i8(<vscale x 16 x i1> %pg,
+                                                               <vscale x 16 x i8> %a,
+                                                               <vscale x 16 x i8> %b)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @mul_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
+; CHECK-LABEL: mul_i16:
+; CHECK: mul z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg,
+                                                               <vscale x 8 x i16> %a,
+                                                               <vscale x 8 x i16> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @mul_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: mul_i32:
+; CHECK: mul z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg,
+                                                               <vscale x 4 x i32> %a,
+                                                               <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @mul_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: mul_i64:
+; CHECK: mul z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg,
+                                                               <vscale x 2 x i64> %a,
+                                                               <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %out
+}
+
+;
+; SMULH
+;
+
+define <vscale x 16 x i8> @smulh_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: smulh_i8:
+; CHECK: smulh z0.b, z0.b, z1.b
+; CHECK-NEXT: ret
+  %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.smulh.nxv16i8(<vscale x 16 x i1> %pg,
+                                                                 <vscale x 16 x i8> %a,
+                                                                 <vscale x 16 x i8> %b)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @smulh_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
+; CHECK-LABEL: smulh_i16:
+; CHECK: smulh z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.smulh.nxv8i16(<vscale x 8 x i1> %pg,
+                                                                 <vscale x 8 x i16> %a,
+                                                                 <vscale x 8 x i16> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @smulh_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: smulh_i32:
+; CHECK: smulh z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.smulh.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                 <vscale x 4 x i32> %a,
+                                                                 <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @smulh_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: smulh_i64:
+; CHECK: smulh z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.smulh.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                 <vscale x 2 x i64> %a,
+                                                                 <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %out
+}
+
+;
+; UMULH
+;
+
+define <vscale x 16 x i8> @umulh_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: umulh_i8:
+; CHECK: umulh z0.b, z0.b, z1.b
+; CHECK-NEXT: ret
+  %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.umulh.nxv16i8(<vscale x 16 x i1> %pg,
+                                                                 <vscale x 16 x i8> %a,
+                                                                 <vscale x 16 x i8> %b)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @umulh_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
+; CHECK-LABEL: umulh_i16:
+; CHECK: umulh z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.umulh.nxv8i16(<vscale x 8 x i1> %pg,
+                                                                 <vscale x 8 x i16> %a,
+                                                                 <vscale x 8 x i16> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @umulh_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: umulh_i32:
+; CHECK: umulh z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.umulh.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                 <vscale x 4 x i32> %a,
+                                                                 <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @umulh_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: umulh_i64:
+; CHECK: umulh z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.umulh.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                 <vscale x 2 x i64> %a,
+                                                                 <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %out
+}
+
+; As umulh_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @umulh_i32_ptrue_all_b(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: umulh_i32_ptrue_all_b:
+; CHECK: umulh z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umulh.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                      <vscale x 4 x i32> %a,
+                                                                      <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As umulh_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @umulh_i32_ptrue_all_h(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: umulh_i32_ptrue_all_h:
+; CHECK: umulh z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+  %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umulh.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                      <vscale x 4 x i32> %a,
+                                                                      <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As umulh_i32 but where pg is i64 based, which is not compatibile for i32 and
+; thus inactive lanes are important and the immediate form cannot be used.
+define <vscale x 4 x i32> @umulh_i32_ptrue_all_d(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: umulh_i32_ptrue_all_d:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: umulh z0.s, [[PG]]/m, z0.s, z1.s
+; CHECK-NEXT: ret
+  %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umulh.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                      <vscale x 4 x i32> %a,
+                                                                      <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+;
+; ASR (wide)
+;
+
+define <vscale x 16 x i8> @asr_i8(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: asr_i8:
+; CHECK: asr z0.b, z0.b, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.asr.wide.nxv16i8(<vscale x 16 x i1> %pg,
+                                                                    <vscale x 16 x i8> %a,
+                                                                    <vscale x 2 x i64> %b)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @asr_i16(<vscale x 8 x i16> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: asr_i16:
+; CHECK: asr z0.h, z0.h, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.asr.wide.nxv8i16(<vscale x 8 x i1> %pg,
+                                                                    <vscale x 8 x i16> %a,
+                                                                    <vscale x 2 x i64> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @asr_i32(<vscale x 4 x i32> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: asr_i32:
+; CHECK: asr z0.s, z0.s, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.asr.wide.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                    <vscale x 4 x i32> %a,
+                                                                    <vscale x 2 x i64> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+;
+; LSL (wide)
+;
+
+define <vscale x 16 x i8> @lsl_i8(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: lsl_i8:
+; CHECK: lsl z0.b, z0.b, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.lsl.wide.nxv16i8(<vscale x 16 x i1> %pg,
+                                                                    <vscale x 16 x i8> %a,
+                                                                    <vscale x 2 x i64> %b)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @lsl_i16(<vscale x 8 x i16> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: lsl_i16:
+; CHECK: lsl z0.h, z0.h, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.lsl.wide.nxv8i16(<vscale x 8 x i1> %pg,
+                                                                    <vscale x 8 x i16> %a,
+                                                                    <vscale x 2 x i64> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @lsl_i32(<vscale x 4 x i32> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: lsl_i32:
+; CHECK: lsl z0.s, z0.s, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.lsl.wide.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                    <vscale x 4 x i32> %a,
+                                                                    <vscale x 2 x i64> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+;
+; LSR (wide)
+;
+
+define <vscale x 16 x i8> @lsr_i8(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: lsr_i8:
+; CHECK: lsr z0.b, z0.b, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.lsr.wide.nxv16i8(<vscale x 16 x i1> %pg,
+                                                                    <vscale x 16 x i8> %a,
+                                                                    <vscale x 2 x i64> %b)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @lsr_i16(<vscale x 8 x i16> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: lsr_i16:
+; CHECK: lsr z0.h, z0.h, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.lsr.wide.nxv8i16(<vscale x 8 x i1> %pg,
+                                                                    <vscale x 8 x i16> %a,
+                                                                    <vscale x 2 x i64> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @lsr_i32(<vscale x 4 x i32> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: lsr_i32:
+; CHECK: lsr z0.s, z0.s, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.wide.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                    <vscale x 4 x i32> %a,
+                                                                    <vscale x 2 x i64> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As lsr_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @lsr_i32_ptrue_all_b(<vscale x 4 x i32> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: lsr_i32_ptrue_all_b:
+; CHECK: lsr z0.s, z0.s, z1.d
+; CHECK-NEXT: ret
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.wide.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                         <vscale x 4 x i32> %a,
+                                                                         <vscale x 2 x i64> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As lsr_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @lsr_i32_ptrue_all_h(<vscale x 4 x i32> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: lsr_i32_ptrue_all_h:
+; CHECK: lsr z0.s, z0.s, z1.d
+; CHECK-NEXT: ret
+  %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.wide.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                         <vscale x 4 x i32> %a,
+                                                                         <vscale x 2 x i64> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As lsr_i32 but where pg is i64 based, which is not compatibile for i32 and
+; thus inactive lanes are important and the immediate form cannot be used.
+define <vscale x 4 x i32> @lsr_i32_ptrue_all_d(<vscale x 4 x i32> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: lsr_i32_ptrue_all_d:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: lsr z0.s, [[PG]]/m, z0.s, z1.d
+; CHECK-NEXT: ret
+  %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.wide.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                         <vscale x 4 x i32> %a,
+                                                                         <vscale x 2 x i64> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+;
+; FADD
+;
+
+define <vscale x 8 x half> @fadd_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: fadd_half:
+; CHECK: fadd z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %pg,
+                                                                 <vscale x 8 x half> %a,
+                                                                 <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @fadd_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: fadd_float:
+; CHECK: fadd z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fadd.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                  <vscale x 4 x float> %a,
+                                                                  <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @fadd_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: fadd_double:
+; CHECK: fadd z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fadd.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                   <vscale x 2 x double> %a,
+                                                                   <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %out
+}
+
+;
+; FSUB
+;
+
+define <vscale x 8 x half> @fsub_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: fsub_half:
+; CHECK: fsub z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fsub.nxv8f16(<vscale x 8 x i1> %pg,
+                                                                 <vscale x 8 x half> %a,
+                                                                 <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @fsub_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: fsub_float:
+; CHECK: fsub z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fsub.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                  <vscale x 4 x float> %a,
+                                                                  <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @fsub_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: fsub_double:
+; CHECK: fsub z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fsub.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                   <vscale x 2 x double> %a,
+                                                                   <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %out
+}
+
+;
+; FMUL
+;
+
+define <vscale x 8 x half> @fmul_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: fmul_half:
+; CHECK: fmul z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %pg,
+                                                                 <vscale x 8 x half> %a,
+                                                                 <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @fmul_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: fmul_float:
+; CHECK: fmul z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                  <vscale x 4 x float> %a,
+                                                                  <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @fmul_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: fmul_double:
+; CHECK: fmul z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                   <vscale x 2 x double> %a,
+                                                                   <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %out
+}
+
+declare <vscale x 16 x  i8> @llvm.aarch64.sve.mul.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x  i8>, <vscale x 16 x  i8>)
+declare <vscale x  8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x  8 x i1>, <vscale x  8 x i16>, <vscale x  8 x i16>)
+declare <vscale x  4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x  4 x i1>, <vscale x  4 x i32>, <vscale x  4 x i32>)
+declare <vscale x  2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x  2 x i1>, <vscale x  2 x i64>, <vscale x  2 x i64>)
+
+declare <vscale x 16 x  i8> @llvm.aarch64.sve.smulh.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x  i8>, <vscale x 16 x  i8>)
+declare <vscale x  8 x i16> @llvm.aarch64.sve.smulh.nxv8i16(<vscale x  8 x i1>, <vscale x  8 x i16>, <vscale x  8 x i16>)
+declare <vscale x  4 x i32> @llvm.aarch64.sve.smulh.nxv4i32(<vscale x  4 x i1>, <vscale x  4 x i32>, <vscale x  4 x i32>)
+declare <vscale x  2 x i64> @llvm.aarch64.sve.smulh.nxv2i64(<vscale x  2 x i1>, <vscale x  2 x i64>, <vscale x  2 x i64>)
+
+declare <vscale x 16 x  i8> @llvm.aarch64.sve.umulh.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x  i8>, <vscale x 16 x  i8>)
+declare <vscale x  8 x i16> @llvm.aarch64.sve.umulh.nxv8i16(<vscale x  8 x i1>, <vscale x  8 x i16>, <vscale x  8 x i16>)
+declare <vscale x  4 x i32> @llvm.aarch64.sve.umulh.nxv4i32(<vscale x  4 x i1>, <vscale x  4 x i32>, <vscale x  4 x i32>)
+declare <vscale x  2 x i64> @llvm.aarch64.sve.umulh.nxv2i64(<vscale x  2 x i1>, <vscale x  2 x i64>, <vscale x  2 x i64>)
+
+declare <vscale x 16 x  i8> @llvm.aarch64.sve.asr.wide.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x  i8>, <vscale x 2 x i64>)
+declare <vscale x  8 x i16> @llvm.aarch64.sve.asr.wide.nxv8i16(<vscale x  8 x i1>, <vscale x  8 x i16>, <vscale x 2 x i64>)
+declare <vscale x  4 x i32> @llvm.aarch64.sve.asr.wide.nxv4i32(<vscale x  4 x i1>, <vscale x  4 x i32>, <vscale x 2 x i64>)
+
+declare <vscale x 16 x  i8> @llvm.aarch64.sve.lsl.wide.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x  i8>, <vscale x 2 x i64>)
+declare <vscale x  8 x i16> @llvm.aarch64.sve.lsl.wide.nxv8i16(<vscale x  8 x i1>, <vscale x  8 x i16>, <vscale x 2 x i64>)
+declare <vscale x  4 x i32> @llvm.aarch64.sve.lsl.wide.nxv4i32(<vscale x  4 x i1>, <vscale x  4 x i32>, <vscale x 2 x i64>)
+
+declare <vscale x 16 x  i8> @llvm.aarch64.sve.lsr.wide.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x  i8>, <vscale x 2 x i64>)
+declare <vscale x  8 x i16> @llvm.aarch64.sve.lsr.wide.nxv8i16(<vscale x  8 x i1>, <vscale x  8 x i16>, <vscale x 2 x i64>)
+declare <vscale x  4 x i32> @llvm.aarch64.sve.lsr.wide.nxv4i32(<vscale x  4 x i1>, <vscale x  4 x i32>, <vscale x 2 x i64>)
+
+declare <vscale x 8 x   half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x   half>, <vscale x 8 x   half>)
+declare <vscale x 4 x  float> @llvm.aarch64.sve.fadd.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x  float>, <vscale x 4 x  float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fadd.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare <vscale x 8 x   half> @llvm.aarch64.sve.fsub.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x   half>, <vscale x 8 x   half>)
+declare <vscale x 4 x  float> @llvm.aarch64.sve.fsub.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x  float>, <vscale x 4 x  float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fsub.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare <vscale x 8 x   half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x   half>, <vscale x 8 x   half>)
+declare <vscale x 4 x  float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x  float>, <vscale x 4 x  float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 16 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32)
+
+attributes #0 = { "target-features"="+sve2" }
author	Bradley Smith <bradley.smith@arm.com>
	Fri, 30 Apr 2021 15:17:37 +0000 (16:17 +0100)
committer	Bradley Smith <bradley.smith@arm.com>
	Mon, 10 May 2021 12:05:37 +0000 (13:05 +0100)
llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp		patch \| blob \| history
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp		patch \| blob \| history
llvm/lib/Target/AArch64/AArch64ISelLowering.h		patch \| blob \| history
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td		patch \| blob \| history
llvm/lib/Target/AArch64/SVEInstrFormats.td		patch \| blob \| history
llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll		patch \| blob \| history
llvm/test/CodeGen/AArch64/sve-intrinsics-unpred-form.ll	[new file with mode: 0644]	patch \| blob