[RISCV] Use VScaleForTuning in costing of operations whose cost depends on VL

author Philip Reames <preames@rivosinc.com>

Thu, 18 Aug 2022 19:56:54 +0000 (12:56 -0700)

committer Philip Reames <listmail@philipreames.com>

Thu, 18 Aug 2022 20:10:03 +0000 (13:10 -0700)
author Philip Reames <preames@rivosinc.com>
Thu, 18 Aug 2022 19:56:54 +0000 (12:56 -0700)
committer Philip Reames <listmail@philipreames.com>
Thu, 18 Aug 2022 20:10:03 +0000 (13:10 -0700)
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

index ef26a82..602bf3a 100644 (file)
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -240,12 +240,12 @@ InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
                                           Alignment, CostKind, I);
  
    // Cost is proportional to the number of memory operations implied.  For
-  // scalable vectors, we use an upper bound on that number since we don't
+  // scalable vectors, we use an estimate on that number since we don't
    // know exactly what VL will be.
    auto &VTy = *cast<VectorType>(DataTy);
    InstructionCost MemOpCost = getMemoryOpCost(Opcode, VTy.getElementType(),
                                                Alignment, 0, CostKind, I);
-  unsigned NumLoads = getMaxVLFor(&VTy);
+  unsigned NumLoads = getEstimatedVLFor(&VTy);
    return NumLoads * MemOpCost;
  }
  
@@ -343,12 +343,12 @@ InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
    return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
  }
  
-unsigned RISCVTTIImpl::getMaxVLFor(VectorType *Ty) {
+unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
    if (isa<ScalableVectorType>(Ty)) {
      const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
      const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
-    const unsigned VectorBitsMax = ST->getRealMaxVLen();
-    return RISCVTargetLowering::computeVLMAX(VectorBitsMax, EltSize, MinSize);
+    const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
+    return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
    }
    return cast<FixedVectorType>(Ty)->getNumElements();
  }
@@ -372,7 +372,7 @@ RISCVTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
  
    // IR Reduction is composed by two vmv and one rvv reduction instruction.
    InstructionCost BaseCost = 2;
-  unsigned VL = getMaxVLFor(Ty);
+  unsigned VL = getEstimatedVLFor(Ty);
    return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
  }
  
@@ -401,7 +401,7 @@ RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
  
    // IR Reduction is composed by two vmv and one rvv reduction instruction.
    InstructionCost BaseCost = 2;
-  unsigned VL = getMaxVLFor(Ty);
+  unsigned VL = getEstimatedVLFor(Ty);
    if (TTI::requiresOrderedReduction(FMF))
      return (LT.first - 1) + BaseCost + VL;
    return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h

index c36d842..0a98082 100644 (file)
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -37,7 +37,15 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
    const RISCVSubtarget *getST() const { return ST; }
    const RISCVTargetLowering *getTLI() const { return TLI; }
  
-  unsigned getMaxVLFor(VectorType *Ty);
+  /// This function returns an estimate for VL to be used in VL based terms
+  /// of the cost model.  For fixed length vectors, this is simply the
+  /// vector length.  For scalable vectors, we return results consistent
+  /// with getVScaleForTuning under the assumption that clients are also
+  /// using that when comparing costs between scalar and vector representation.
+  /// This does unfortunately mean that we can both undershoot and overshot
+  /// the true cost significantly if getVScaleForTuning is wildly off for the
+  /// actual target hardware.
+  unsigned getEstimatedVLFor(VectorType *Ty);
  public:
    explicit RISCVTTIImpl(const RISCVTargetMachine *TM, const Function &F)
        : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
diff --git a/llvm/test/Analysis/CostModel/RISCV/reduce-scalable-fp.ll b/llvm/test/Analysis/CostModel/RISCV/reduce-scalable-fp.ll

index 8095ab7..0234625 100644 (file)
--- a/llvm/test/Analysis/CostModel/RISCV/reduce-scalable-fp.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/reduce-scalable-fp.ll
@@ -6,7 +6,7 @@ declare half @llvm.vector.reduce.fadd.nxv1f16(half, <vscale x 1 x half>)
  
  define half @vreduce_fadd_nxv1f16(<vscale x 1 x half> %v, half %s) {
  ; CHECK-LABEL: 'vreduce_fadd_nxv1f16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call reassoc half @llvm.vector.reduce.fadd.nxv1f16(half %s, <vscale x 1 x half> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call reassoc half @llvm.vector.reduce.fadd.nxv1f16(half %s, <vscale x 1 x half> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret half %red
  ;
    %red = call reassoc half @llvm.vector.reduce.fadd.nxv1f16(half %s, <vscale x 1 x half> %v)
@@ -15,7 +15,7 @@ define half @vreduce_fadd_nxv1f16(<vscale x 1 x half> %v, half %s) {
  
  define half @vreduce_ord_fadd_nxv1f16(<vscale x 1 x half> %v, half %s) {
  ; CHECK-LABEL: 'vreduce_ord_fadd_nxv1f16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1026 for instruction: %red = call half @llvm.vector.reduce.fadd.nxv1f16(half %s, <vscale x 1 x half> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call half @llvm.vector.reduce.fadd.nxv1f16(half %s, <vscale x 1 x half> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret half %red
  ;
    %red = call half @llvm.vector.reduce.fadd.nxv1f16(half %s, <vscale x 1 x half> %v)
@@ -26,7 +26,7 @@ declare half @llvm.vector.reduce.fadd.nxv2f16(half, <vscale x 2 x half>)
  
  define half @vreduce_fadd_nxv2f16(<vscale x 2 x half> %v, half %s) {
  ; CHECK-LABEL: 'vreduce_fadd_nxv2f16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call reassoc half @llvm.vector.reduce.fadd.nxv2f16(half %s, <vscale x 2 x half> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call reassoc half @llvm.vector.reduce.fadd.nxv2f16(half %s, <vscale x 2 x half> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret half %red
  ;
    %red = call reassoc half @llvm.vector.reduce.fadd.nxv2f16(half %s, <vscale x 2 x half> %v)
@@ -35,7 +35,7 @@ define half @vreduce_fadd_nxv2f16(<vscale x 2 x half> %v, half %s) {
  
  define half @vreduce_ord_fadd_nxv2f16(<vscale x 2 x half> %v, half %s) {
  ; CHECK-LABEL: 'vreduce_ord_fadd_nxv2f16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2050 for instruction: %red = call half @llvm.vector.reduce.fadd.nxv2f16(half %s, <vscale x 2 x half> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %red = call half @llvm.vector.reduce.fadd.nxv2f16(half %s, <vscale x 2 x half> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret half %red
  ;
    %red = call half @llvm.vector.reduce.fadd.nxv2f16(half %s, <vscale x 2 x half> %v)
@@ -46,7 +46,7 @@ declare half @llvm.vector.reduce.fadd.nxv4f16(half, <vscale x 4 x half>)
  
  define half @vreduce_fadd_nxv4f16(<vscale x 4 x half> %v, half %s) {
  ; CHECK-LABEL: 'vreduce_fadd_nxv4f16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call reassoc half @llvm.vector.reduce.fadd.nxv4f16(half %s, <vscale x 4 x half> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call reassoc half @llvm.vector.reduce.fadd.nxv4f16(half %s, <vscale x 4 x half> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret half %red
  ;
    %red = call reassoc half @llvm.vector.reduce.fadd.nxv4f16(half %s, <vscale x 4 x half> %v)
@@ -55,7 +55,7 @@ define half @vreduce_fadd_nxv4f16(<vscale x 4 x half> %v, half %s) {
  
  define half @vreduce_ord_fadd_nxv4f16(<vscale x 4 x half> %v, half %s) {
  ; CHECK-LABEL: 'vreduce_ord_fadd_nxv4f16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4098 for instruction: %red = call half @llvm.vector.reduce.fadd.nxv4f16(half %s, <vscale x 4 x half> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %red = call half @llvm.vector.reduce.fadd.nxv4f16(half %s, <vscale x 4 x half> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret half %red
  ;
    %red = call half @llvm.vector.reduce.fadd.nxv4f16(half %s, <vscale x 4 x half> %v)
@@ -66,7 +66,7 @@ declare float @llvm.vector.reduce.fadd.nxv1f32(float, <vscale x 1 x float>)
  
  define float @vreduce_fadd_nxv1f32(<vscale x 1 x float> %v, float %s) {
  ; CHECK-LABEL: 'vreduce_fadd_nxv1f32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call reassoc float @llvm.vector.reduce.fadd.nxv1f32(float %s, <vscale x 1 x float> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call reassoc float @llvm.vector.reduce.fadd.nxv1f32(float %s, <vscale x 1 x float> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float %red
  ;
    %red = call reassoc float @llvm.vector.reduce.fadd.nxv1f32(float %s, <vscale x 1 x float> %v)
@@ -75,7 +75,7 @@ define float @vreduce_fadd_nxv1f32(<vscale x 1 x float> %v, float %s) {
  
  define float @vreduce_ord_fadd_nxv1f32(<vscale x 1 x float> %v, float %s) {
  ; CHECK-LABEL: 'vreduce_ord_fadd_nxv1f32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1026 for instruction: %red = call float @llvm.vector.reduce.fadd.nxv1f32(float %s, <vscale x 1 x float> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call float @llvm.vector.reduce.fadd.nxv1f32(float %s, <vscale x 1 x float> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float %red
  ;
    %red = call float @llvm.vector.reduce.fadd.nxv1f32(float %s, <vscale x 1 x float> %v)
@@ -85,7 +85,7 @@ define float @vreduce_ord_fadd_nxv1f32(<vscale x 1 x float> %v, float %s) {
  define float @vreduce_fwadd_nxv1f32(<vscale x 1 x half> %v, float %s) {
  ; CHECK-LABEL: 'vreduce_fwadd_nxv1f32'
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = fpext <vscale x 1 x half> %v to <vscale x 1 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call reassoc float @llvm.vector.reduce.fadd.nxv1f32(float %s, <vscale x 1 x float> %e)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call reassoc float @llvm.vector.reduce.fadd.nxv1f32(float %s, <vscale x 1 x float> %e)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float %red
  ;
    %e = fpext <vscale x 1 x half> %v to <vscale x 1 x float>
@@ -96,7 +96,7 @@ define float @vreduce_fwadd_nxv1f32(<vscale x 1 x half> %v, float %s) {
  define float @vreduce_ord_fwadd_nxv1f32(<vscale x 1 x half> %v, float %s) {
  ; CHECK-LABEL: 'vreduce_ord_fwadd_nxv1f32'
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = fpext <vscale x 1 x half> %v to <vscale x 1 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1026 for instruction: %red = call float @llvm.vector.reduce.fadd.nxv1f32(float %s, <vscale x 1 x float> %e)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call float @llvm.vector.reduce.fadd.nxv1f32(float %s, <vscale x 1 x float> %e)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float %red
  ;
    %e = fpext <vscale x 1 x half> %v to <vscale x 1 x float>
@@ -108,7 +108,7 @@ declare float @llvm.vector.reduce.fadd.nxv2f32(float, <vscale x 2 x float>)
  
  define float @vreduce_fadd_nxv2f32(<vscale x 2 x float> %v, float %s) {
  ; CHECK-LABEL: 'vreduce_fadd_nxv2f32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call reassoc float @llvm.vector.reduce.fadd.nxv2f32(float %s, <vscale x 2 x float> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call reassoc float @llvm.vector.reduce.fadd.nxv2f32(float %s, <vscale x 2 x float> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float %red
  ;
    %red = call reassoc float @llvm.vector.reduce.fadd.nxv2f32(float %s, <vscale x 2 x float> %v)
@@ -117,7 +117,7 @@ define float @vreduce_fadd_nxv2f32(<vscale x 2 x float> %v, float %s) {
  
  define float @vreduce_ord_fadd_nxv2f32(<vscale x 2 x float> %v, float %s) {
  ; CHECK-LABEL: 'vreduce_ord_fadd_nxv2f32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2050 for instruction: %red = call float @llvm.vector.reduce.fadd.nxv2f32(float %s, <vscale x 2 x float> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %red = call float @llvm.vector.reduce.fadd.nxv2f32(float %s, <vscale x 2 x float> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float %red
  ;
    %red = call float @llvm.vector.reduce.fadd.nxv2f32(float %s, <vscale x 2 x float> %v)
@@ -127,7 +127,7 @@ define float @vreduce_ord_fadd_nxv2f32(<vscale x 2 x float> %v, float %s) {
  define float @vreduce_fwadd_nxv2f32(<vscale x 2 x half> %v, float %s) {
  ; CHECK-LABEL: 'vreduce_fwadd_nxv2f32'
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = fpext <vscale x 2 x half> %v to <vscale x 2 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call reassoc float @llvm.vector.reduce.fadd.nxv2f32(float %s, <vscale x 2 x float> %e)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call reassoc float @llvm.vector.reduce.fadd.nxv2f32(float %s, <vscale x 2 x float> %e)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float %red
  ;
    %e = fpext <vscale x 2 x half> %v to <vscale x 2 x float>
@@ -138,7 +138,7 @@ define float @vreduce_fwadd_nxv2f32(<vscale x 2 x half> %v, float %s) {
  define float @vreduce_ord_fwadd_nxv2f32(<vscale x 2 x half> %v, float %s) {
  ; CHECK-LABEL: 'vreduce_ord_fwadd_nxv2f32'
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = fpext <vscale x 2 x half> %v to <vscale x 2 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2050 for instruction: %red = call float @llvm.vector.reduce.fadd.nxv2f32(float %s, <vscale x 2 x float> %e)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %red = call float @llvm.vector.reduce.fadd.nxv2f32(float %s, <vscale x 2 x float> %e)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float %red
  ;
    %e = fpext <vscale x 2 x half> %v to <vscale x 2 x float>
@@ -150,7 +150,7 @@ declare float @llvm.vector.reduce.fadd.nxv4f32(float, <vscale x 4 x float>)
  
  define float @vreduce_fadd_nxv4f32(<vscale x 4 x float> %v, float %s) {
  ; CHECK-LABEL: 'vreduce_fadd_nxv4f32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float %s, <vscale x 4 x float> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float %s, <vscale x 4 x float> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float %red
  ;
    %red = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float %s, <vscale x 4 x float> %v)
@@ -159,7 +159,7 @@ define float @vreduce_fadd_nxv4f32(<vscale x 4 x float> %v, float %s) {
  
  define float @vreduce_ord_fadd_nxv4f32(<vscale x 4 x float> %v, float %s) {
  ; CHECK-LABEL: 'vreduce_ord_fadd_nxv4f32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4098 for instruction: %red = call float @llvm.vector.reduce.fadd.nxv4f32(float %s, <vscale x 4 x float> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %red = call float @llvm.vector.reduce.fadd.nxv4f32(float %s, <vscale x 4 x float> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float %red
  ;
    %red = call float @llvm.vector.reduce.fadd.nxv4f32(float %s, <vscale x 4 x float> %v)
@@ -169,7 +169,7 @@ define float @vreduce_ord_fadd_nxv4f32(<vscale x 4 x float> %v, float %s) {
  define float @vreduce_fwadd_nxv4f32(<vscale x 4 x half> %v, float %s) {
  ; CHECK-LABEL: 'vreduce_fwadd_nxv4f32'
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = fpext <vscale x 4 x half> %v to <vscale x 4 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float %s, <vscale x 4 x float> %e)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float %s, <vscale x 4 x float> %e)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float %red
  ;
    %e = fpext <vscale x 4 x half> %v to <vscale x 4 x float>
@@ -180,7 +180,7 @@ define float @vreduce_fwadd_nxv4f32(<vscale x 4 x half> %v, float %s) {
  define float @vreduce_ord_fwadd_nxv4f32(<vscale x 4 x half> %v, float %s) {
  ; CHECK-LABEL: 'vreduce_ord_fwadd_nxv4f32'
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = fpext <vscale x 4 x half> %v to <vscale x 4 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4098 for instruction: %red = call float @llvm.vector.reduce.fadd.nxv4f32(float %s, <vscale x 4 x float> %e)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %red = call float @llvm.vector.reduce.fadd.nxv4f32(float %s, <vscale x 4 x float> %e)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float %red
  ;
    %e = fpext <vscale x 4 x half> %v to <vscale x 4 x float>
@@ -192,7 +192,7 @@ declare double @llvm.vector.reduce.fadd.nxv1f64(double, <vscale x 1 x double>)
  
  define double @vreduce_fadd_nxv1f64(<vscale x 1 x double> %v, double %s) {
  ; CHECK-LABEL: 'vreduce_fadd_nxv1f64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call reassoc double @llvm.vector.reduce.fadd.nxv1f64(double %s, <vscale x 1 x double> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call reassoc double @llvm.vector.reduce.fadd.nxv1f64(double %s, <vscale x 1 x double> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret double %red
  ;
    %red = call reassoc double @llvm.vector.reduce.fadd.nxv1f64(double %s, <vscale x 1 x double> %v)
@@ -201,7 +201,7 @@ define double @vreduce_fadd_nxv1f64(<vscale x 1 x double> %v, double %s) {
  
  define double @vreduce_ord_fadd_nxv1f64(<vscale x 1 x double> %v, double %s) {
  ; CHECK-LABEL: 'vreduce_ord_fadd_nxv1f64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1026 for instruction: %red = call double @llvm.vector.reduce.fadd.nxv1f64(double %s, <vscale x 1 x double> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call double @llvm.vector.reduce.fadd.nxv1f64(double %s, <vscale x 1 x double> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret double %red
  ;
    %red = call double @llvm.vector.reduce.fadd.nxv1f64(double %s, <vscale x 1 x double> %v)
@@ -211,7 +211,7 @@ define double @vreduce_ord_fadd_nxv1f64(<vscale x 1 x double> %v, double %s) {
  define double @vreduce_fwadd_nxv1f64(<vscale x 1 x float> %v, double %s) {
  ; CHECK-LABEL: 'vreduce_fwadd_nxv1f64'
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = fpext <vscale x 1 x float> %v to <vscale x 1 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call reassoc double @llvm.vector.reduce.fadd.nxv1f64(double %s, <vscale x 1 x double> %e)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call reassoc double @llvm.vector.reduce.fadd.nxv1f64(double %s, <vscale x 1 x double> %e)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret double %red
  ;
    %e = fpext <vscale x 1 x float> %v to <vscale x 1 x double>
@@ -222,7 +222,7 @@ define double @vreduce_fwadd_nxv1f64(<vscale x 1 x float> %v, double %s) {
  define double @vreduce_ord_fwadd_nxv1f64(<vscale x 1 x float> %v, double %s) {
  ; CHECK-LABEL: 'vreduce_ord_fwadd_nxv1f64'
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = fpext <vscale x 1 x float> %v to <vscale x 1 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1026 for instruction: %red = call double @llvm.vector.reduce.fadd.nxv1f64(double %s, <vscale x 1 x double> %e)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call double @llvm.vector.reduce.fadd.nxv1f64(double %s, <vscale x 1 x double> %e)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret double %red
  ;
    %e = fpext <vscale x 1 x float> %v to <vscale x 1 x double>
@@ -234,7 +234,7 @@ declare double @llvm.vector.reduce.fadd.nxv2f64(double, <vscale x 2 x double>)
  
  define double @vreduce_fadd_nxv2f64(<vscale x 2 x double> %v, double %s) {
  ; CHECK-LABEL: 'vreduce_fadd_nxv2f64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call reassoc double @llvm.vector.reduce.fadd.nxv2f64(double %s, <vscale x 2 x double> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call reassoc double @llvm.vector.reduce.fadd.nxv2f64(double %s, <vscale x 2 x double> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret double %red
  ;
    %red = call reassoc double @llvm.vector.reduce.fadd.nxv2f64(double %s, <vscale x 2 x double> %v)
@@ -243,7 +243,7 @@ define double @vreduce_fadd_nxv2f64(<vscale x 2 x double> %v, double %s) {
  
  define double @vreduce_ord_fadd_nxv2f64(<vscale x 2 x double> %v, double %s) {
  ; CHECK-LABEL: 'vreduce_ord_fadd_nxv2f64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2050 for instruction: %red = call double @llvm.vector.reduce.fadd.nxv2f64(double %s, <vscale x 2 x double> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %red = call double @llvm.vector.reduce.fadd.nxv2f64(double %s, <vscale x 2 x double> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret double %red
  ;
    %red = call double @llvm.vector.reduce.fadd.nxv2f64(double %s, <vscale x 2 x double> %v)
@@ -253,7 +253,7 @@ define double @vreduce_ord_fadd_nxv2f64(<vscale x 2 x double> %v, double %s) {
  define double @vreduce_fwadd_nxv2f64(<vscale x 2 x float> %v, double %s) {
  ; CHECK-LABEL: 'vreduce_fwadd_nxv2f64'
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = fpext <vscale x 2 x float> %v to <vscale x 2 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call reassoc double @llvm.vector.reduce.fadd.nxv2f64(double %s, <vscale x 2 x double> %e)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call reassoc double @llvm.vector.reduce.fadd.nxv2f64(double %s, <vscale x 2 x double> %e)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret double %red
  ;
    %e = fpext <vscale x 2 x float> %v to <vscale x 2 x double>
@@ -264,7 +264,7 @@ define double @vreduce_fwadd_nxv2f64(<vscale x 2 x float> %v, double %s) {
  define double @vreduce_ord_fwadd_nxv2f64(<vscale x 2 x float> %v, double %s) {
  ; CHECK-LABEL: 'vreduce_ord_fwadd_nxv2f64'
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = fpext <vscale x 2 x float> %v to <vscale x 2 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2050 for instruction: %red = call double @llvm.vector.reduce.fadd.nxv2f64(double %s, <vscale x 2 x double> %e)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %red = call double @llvm.vector.reduce.fadd.nxv2f64(double %s, <vscale x 2 x double> %e)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret double %red
  ;
    %e = fpext <vscale x 2 x float> %v to <vscale x 2 x double>
@@ -276,7 +276,7 @@ declare double @llvm.vector.reduce.fadd.nxv4f64(double, <vscale x 4 x double>)
  
  define double @vreduce_fadd_nxv4f64(<vscale x 4 x double> %v, double %s) {
  ; CHECK-LABEL: 'vreduce_fadd_nxv4f64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call reassoc double @llvm.vector.reduce.fadd.nxv4f64(double %s, <vscale x 4 x double> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call reassoc double @llvm.vector.reduce.fadd.nxv4f64(double %s, <vscale x 4 x double> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret double %red
  ;
    %red = call reassoc double @llvm.vector.reduce.fadd.nxv4f64(double %s, <vscale x 4 x double> %v)
@@ -285,7 +285,7 @@ define double @vreduce_fadd_nxv4f64(<vscale x 4 x double> %v, double %s) {
  
  define double @vreduce_ord_fadd_nxv4f64(<vscale x 4 x double> %v, double %s) {
  ; CHECK-LABEL: 'vreduce_ord_fadd_nxv4f64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4098 for instruction: %red = call double @llvm.vector.reduce.fadd.nxv4f64(double %s, <vscale x 4 x double> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %red = call double @llvm.vector.reduce.fadd.nxv4f64(double %s, <vscale x 4 x double> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret double %red
  ;
    %red = call double @llvm.vector.reduce.fadd.nxv4f64(double %s, <vscale x 4 x double> %v)
@@ -295,7 +295,7 @@ define double @vreduce_ord_fadd_nxv4f64(<vscale x 4 x double> %v, double %s) {
  define double @vreduce_fwadd_nxv4f64(<vscale x 4 x float> %v, double %s) {
  ; CHECK-LABEL: 'vreduce_fwadd_nxv4f64'
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = fpext <vscale x 4 x float> %v to <vscale x 4 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call reassoc double @llvm.vector.reduce.fadd.nxv4f64(double %s, <vscale x 4 x double> %e)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call reassoc double @llvm.vector.reduce.fadd.nxv4f64(double %s, <vscale x 4 x double> %e)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret double %red
  ;
    %e = fpext <vscale x 4 x float> %v to <vscale x 4 x double>
@@ -306,7 +306,7 @@ define double @vreduce_fwadd_nxv4f64(<vscale x 4 x float> %v, double %s) {
  define double @vreduce_ord_fwadd_nxv4f64(<vscale x 4 x float> %v, double %s) {
  ; CHECK-LABEL: 'vreduce_ord_fwadd_nxv4f64'
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = fpext <vscale x 4 x float> %v to <vscale x 4 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4098 for instruction: %red = call double @llvm.vector.reduce.fadd.nxv4f64(double %s, <vscale x 4 x double> %e)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %red = call double @llvm.vector.reduce.fadd.nxv4f64(double %s, <vscale x 4 x double> %e)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret double %red
  ;
    %e = fpext <vscale x 4 x float> %v to <vscale x 4 x double>
@@ -318,7 +318,7 @@ declare half @llvm.vector.reduce.fmin.nxv1f16(<vscale x 1 x half>)
  
  define half @vreduce_fmin_nxv1f16(<vscale x 1 x half> %v) {
  ; CHECK-LABEL: 'vreduce_fmin_nxv1f16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call half @llvm.vector.reduce.fmin.nxv1f16(<vscale x 1 x half> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call half @llvm.vector.reduce.fmin.nxv1f16(<vscale x 1 x half> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret half %red
  ;
    %red = call half @llvm.vector.reduce.fmin.nxv1f16(<vscale x 1 x half> %v)
@@ -327,7 +327,7 @@ define half @vreduce_fmin_nxv1f16(<vscale x 1 x half> %v) {
  
  define half @vreduce_fmin_nxv1f16_nonans(<vscale x 1 x half> %v) #0 {
  ; CHECK-LABEL: 'vreduce_fmin_nxv1f16_nonans'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call nnan half @llvm.vector.reduce.fmin.nxv1f16(<vscale x 1 x half> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call nnan half @llvm.vector.reduce.fmin.nxv1f16(<vscale x 1 x half> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret half %red
  ;
    %red = call nnan half @llvm.vector.reduce.fmin.nxv1f16(<vscale x 1 x half> %v)
@@ -336,7 +336,7 @@ define half @vreduce_fmin_nxv1f16_nonans(<vscale x 1 x half> %v) #0 {
  
  define half @vreduce_fmin_nxv1f16_nonans_noinfs(<vscale x 1 x half> %v) #1 {
  ; CHECK-LABEL: 'vreduce_fmin_nxv1f16_nonans_noinfs'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call nnan ninf half @llvm.vector.reduce.fmin.nxv1f16(<vscale x 1 x half> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call nnan ninf half @llvm.vector.reduce.fmin.nxv1f16(<vscale x 1 x half> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret half %red
  ;
    %red = call nnan ninf half @llvm.vector.reduce.fmin.nxv1f16(<vscale x 1 x half> %v)
@@ -347,7 +347,7 @@ declare half @llvm.vector.reduce.fmin.nxv2f16(<vscale x 2 x half>)
  
  define half @vreduce_fmin_nxv2f16(<vscale x 2 x half> %v) {
  ; CHECK-LABEL: 'vreduce_fmin_nxv2f16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call half @llvm.vector.reduce.fmin.nxv2f16(<vscale x 2 x half> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call half @llvm.vector.reduce.fmin.nxv2f16(<vscale x 2 x half> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret half %red
  ;
    %red = call half @llvm.vector.reduce.fmin.nxv2f16(<vscale x 2 x half> %v)
@@ -358,7 +358,7 @@ declare half @llvm.vector.reduce.fmin.nxv4f16(<vscale x 4 x half>)
  
  define half @vreduce_fmin_nxv4f16(<vscale x 4 x half> %v) {
  ; CHECK-LABEL: 'vreduce_fmin_nxv4f16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call half @llvm.vector.reduce.fmin.nxv4f16(<vscale x 4 x half> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call half @llvm.vector.reduce.fmin.nxv4f16(<vscale x 4 x half> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret half %red
  ;
    %red = call half @llvm.vector.reduce.fmin.nxv4f16(<vscale x 4 x half> %v)
@@ -369,7 +369,7 @@ declare half @llvm.vector.reduce.fmin.nxv64f16(<vscale x 64 x half>)
  
  define half @vreduce_fmin_nxv64f16(<vscale x 64 x half> %v) {
  ; CHECK-LABEL: 'vreduce_fmin_nxv64f16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %red = call half @llvm.vector.reduce.fmin.nxv64f16(<vscale x 64 x half> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %red = call half @llvm.vector.reduce.fmin.nxv64f16(<vscale x 64 x half> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret half %red
  ;
    %red = call half @llvm.vector.reduce.fmin.nxv64f16(<vscale x 64 x half> %v)
@@ -380,7 +380,7 @@ declare float @llvm.vector.reduce.fmin.nxv1f32(<vscale x 1 x float>)
  
  define float @vreduce_fmin_nxv1f32(<vscale x 1 x float> %v) {
  ; CHECK-LABEL: 'vreduce_fmin_nxv1f32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call float @llvm.vector.reduce.fmin.nxv1f32(<vscale x 1 x float> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call float @llvm.vector.reduce.fmin.nxv1f32(<vscale x 1 x float> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float %red
  ;
    %red = call float @llvm.vector.reduce.fmin.nxv1f32(<vscale x 1 x float> %v)
@@ -389,7 +389,7 @@ define float @vreduce_fmin_nxv1f32(<vscale x 1 x float> %v) {
  
  define float @vreduce_fmin_nxv1f32_nonans(<vscale x 1 x float> %v) {
  ; CHECK-LABEL: 'vreduce_fmin_nxv1f32_nonans'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call nnan float @llvm.vector.reduce.fmin.nxv1f32(<vscale x 1 x float> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call nnan float @llvm.vector.reduce.fmin.nxv1f32(<vscale x 1 x float> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float %red
  ;
    %red = call nnan float @llvm.vector.reduce.fmin.nxv1f32(<vscale x 1 x float> %v)
@@ -398,7 +398,7 @@ define float @vreduce_fmin_nxv1f32_nonans(<vscale x 1 x float> %v) {
  
  define float @vreduce_fmin_nxv1f32_nonans_noinfs(<vscale x 1 x float> %v) {
  ; CHECK-LABEL: 'vreduce_fmin_nxv1f32_nonans_noinfs'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call nnan ninf float @llvm.vector.reduce.fmin.nxv1f32(<vscale x 1 x float> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call nnan ninf float @llvm.vector.reduce.fmin.nxv1f32(<vscale x 1 x float> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float %red
  ;
    %red = call nnan ninf float @llvm.vector.reduce.fmin.nxv1f32(<vscale x 1 x float> %v)
@@ -409,7 +409,7 @@ declare float @llvm.vector.reduce.fmin.nxv2f32(<vscale x 2 x float>)
  
  define float @vreduce_fmin_nxv2f32(<vscale x 2 x float> %v) {
  ; CHECK-LABEL: 'vreduce_fmin_nxv2f32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call float @llvm.vector.reduce.fmin.nxv2f32(<vscale x 2 x float> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call float @llvm.vector.reduce.fmin.nxv2f32(<vscale x 2 x float> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float %red
  ;
    %red = call float @llvm.vector.reduce.fmin.nxv2f32(<vscale x 2 x float> %v)
@@ -420,7 +420,7 @@ declare float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float>)
  
  define float @vreduce_fmin_nxv4f32(<vscale x 4 x float> %v) {
  ; CHECK-LABEL: 'vreduce_fmin_nxv4f32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float %red
  ;
    %red = call float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %v)
@@ -431,7 +431,7 @@ declare float @llvm.vector.reduce.fmin.nxv32f32(<vscale x 32 x float>)
  
  define float @vreduce_fmin_nxv32f32(<vscale x 32 x float> %v) {
  ; CHECK-LABEL: 'vreduce_fmin_nxv32f32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %red = call float @llvm.vector.reduce.fmin.nxv32f32(<vscale x 32 x float> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %red = call float @llvm.vector.reduce.fmin.nxv32f32(<vscale x 32 x float> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float %red
  ;
    %red = call float @llvm.vector.reduce.fmin.nxv32f32(<vscale x 32 x float> %v)
@@ -442,7 +442,7 @@ declare double @llvm.vector.reduce.fmin.nxv1f64(<vscale x 1 x double>)
  
  define double @vreduce_fmin_nxv1f64(<vscale x 1 x double> %v) {
  ; CHECK-LABEL: 'vreduce_fmin_nxv1f64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call double @llvm.vector.reduce.fmin.nxv1f64(<vscale x 1 x double> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call double @llvm.vector.reduce.fmin.nxv1f64(<vscale x 1 x double> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret double %red
  ;
    %red = call double @llvm.vector.reduce.fmin.nxv1f64(<vscale x 1 x double> %v)
@@ -451,7 +451,7 @@ define double @vreduce_fmin_nxv1f64(<vscale x 1 x double> %v) {
  
  define double @vreduce_fmin_nxv1f64_nonans(<vscale x 1 x double> %v) {
  ; CHECK-LABEL: 'vreduce_fmin_nxv1f64_nonans'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call nnan double @llvm.vector.reduce.fmin.nxv1f64(<vscale x 1 x double> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call nnan double @llvm.vector.reduce.fmin.nxv1f64(<vscale x 1 x double> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret double %red
  ;
    %red = call nnan double @llvm.vector.reduce.fmin.nxv1f64(<vscale x 1 x double> %v)
@@ -460,7 +460,7 @@ define double @vreduce_fmin_nxv1f64_nonans(<vscale x 1 x double> %v) {
  
  define double @vreduce_fmin_nxv1f64_nonans_noinfs(<vscale x 1 x double> %v) {
  ; CHECK-LABEL: 'vreduce_fmin_nxv1f64_nonans_noinfs'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call nnan ninf double @llvm.vector.reduce.fmin.nxv1f64(<vscale x 1 x double> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call nnan ninf double @llvm.vector.reduce.fmin.nxv1f64(<vscale x 1 x double> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret double %red
  ;
    %red = call nnan ninf double @llvm.vector.reduce.fmin.nxv1f64(<vscale x 1 x double> %v)
@@ -471,7 +471,7 @@ declare double @llvm.vector.reduce.fmin.nxv2f64(<vscale x 2 x double>)
  
  define double @vreduce_fmin_nxv2f64(<vscale x 2 x double> %v) {
  ; CHECK-LABEL: 'vreduce_fmin_nxv2f64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call double @llvm.vector.reduce.fmin.nxv2f64(<vscale x 2 x double> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call double @llvm.vector.reduce.fmin.nxv2f64(<vscale x 2 x double> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret double %red
  ;
    %red = call double @llvm.vector.reduce.fmin.nxv2f64(<vscale x 2 x double> %v)
@@ -482,7 +482,7 @@ declare double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double>)
  
  define double @vreduce_fmin_nxv4f64(<vscale x 4 x double> %v) {
  ; CHECK-LABEL: 'vreduce_fmin_nxv4f64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret double %red
  ;
    %red = call double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double> %v)
@@ -493,7 +493,7 @@ declare double @llvm.vector.reduce.fmin.nxv16f64(<vscale x 16 x double>)
  
  define double @vreduce_fmin_nxv16f64(<vscale x 16 x double> %v) {
  ; CHECK-LABEL: 'vreduce_fmin_nxv16f64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %red = call double @llvm.vector.reduce.fmin.nxv16f64(<vscale x 16 x double> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %red = call double @llvm.vector.reduce.fmin.nxv16f64(<vscale x 16 x double> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret double %red
  ;
    %red = call double @llvm.vector.reduce.fmin.nxv16f64(<vscale x 16 x double> %v)
@@ -504,7 +504,7 @@ declare half @llvm.vector.reduce.fmax.nxv1f16(<vscale x 1 x half>)
  
  define half @vreduce_fmax_nxv1f16(<vscale x 1 x half> %v) {
  ; CHECK-LABEL: 'vreduce_fmax_nxv1f16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call half @llvm.vector.reduce.fmax.nxv1f16(<vscale x 1 x half> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call half @llvm.vector.reduce.fmax.nxv1f16(<vscale x 1 x half> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret half %red
  ;
    %red = call half @llvm.vector.reduce.fmax.nxv1f16(<vscale x 1 x half> %v)
@@ -513,7 +513,7 @@ define half @vreduce_fmax_nxv1f16(<vscale x 1 x half> %v) {
  
  define half @vreduce_fmax_nxv1f16_nonans(<vscale x 1 x half> %v) #0 {
  ; CHECK-LABEL: 'vreduce_fmax_nxv1f16_nonans'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call nnan half @llvm.vector.reduce.fmax.nxv1f16(<vscale x 1 x half> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call nnan half @llvm.vector.reduce.fmax.nxv1f16(<vscale x 1 x half> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret half %red
  ;
    %red = call nnan half @llvm.vector.reduce.fmax.nxv1f16(<vscale x 1 x half> %v)
@@ -522,7 +522,7 @@ define half @vreduce_fmax_nxv1f16_nonans(<vscale x 1 x half> %v) #0 {
  
  define half @vreduce_fmax_nxv1f16_nonans_noinfs(<vscale x 1 x half> %v) #1 {
  ; CHECK-LABEL: 'vreduce_fmax_nxv1f16_nonans_noinfs'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call nnan ninf half @llvm.vector.reduce.fmax.nxv1f16(<vscale x 1 x half> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call nnan ninf half @llvm.vector.reduce.fmax.nxv1f16(<vscale x 1 x half> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret half %red
  ;
    %red = call nnan ninf half @llvm.vector.reduce.fmax.nxv1f16(<vscale x 1 x half> %v)
@@ -533,7 +533,7 @@ declare half @llvm.vector.reduce.fmax.nxv2f16(<vscale x 2 x half>)
  
  define half @vreduce_fmax_nxv2f16(<vscale x 2 x half> %v) {
  ; CHECK-LABEL: 'vreduce_fmax_nxv2f16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call half @llvm.vector.reduce.fmax.nxv2f16(<vscale x 2 x half> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call half @llvm.vector.reduce.fmax.nxv2f16(<vscale x 2 x half> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret half %red
  ;
    %red = call half @llvm.vector.reduce.fmax.nxv2f16(<vscale x 2 x half> %v)
@@ -544,7 +544,7 @@ declare half @llvm.vector.reduce.fmax.nxv4f16(<vscale x 4 x half>)
  
  define half @vreduce_fmax_nxv4f16(<vscale x 4 x half> %v) {
  ; CHECK-LABEL: 'vreduce_fmax_nxv4f16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call half @llvm.vector.reduce.fmax.nxv4f16(<vscale x 4 x half> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call half @llvm.vector.reduce.fmax.nxv4f16(<vscale x 4 x half> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret half %red
  ;
    %red = call half @llvm.vector.reduce.fmax.nxv4f16(<vscale x 4 x half> %v)
@@ -555,7 +555,7 @@ declare half @llvm.vector.reduce.fmax.nxv64f16(<vscale x 64 x half>)
  
  define half @vreduce_fmax_nxv64f16(<vscale x 64 x half> %v) {
  ; CHECK-LABEL: 'vreduce_fmax_nxv64f16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %red = call half @llvm.vector.reduce.fmax.nxv64f16(<vscale x 64 x half> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %red = call half @llvm.vector.reduce.fmax.nxv64f16(<vscale x 64 x half> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret half %red
  ;
    %red = call half @llvm.vector.reduce.fmax.nxv64f16(<vscale x 64 x half> %v)
@@ -566,7 +566,7 @@ declare float @llvm.vector.reduce.fmax.nxv1f32(<vscale x 1 x float>)
  
  define float @vreduce_fmax_nxv1f32(<vscale x 1 x float> %v) {
  ; CHECK-LABEL: 'vreduce_fmax_nxv1f32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call float @llvm.vector.reduce.fmax.nxv1f32(<vscale x 1 x float> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call float @llvm.vector.reduce.fmax.nxv1f32(<vscale x 1 x float> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float %red
  ;
    %red = call float @llvm.vector.reduce.fmax.nxv1f32(<vscale x 1 x float> %v)
@@ -575,7 +575,7 @@ define float @vreduce_fmax_nxv1f32(<vscale x 1 x float> %v) {
  
  define float @vreduce_fmax_nxv1f32_nonans(<vscale x 1 x float> %v) {
  ; CHECK-LABEL: 'vreduce_fmax_nxv1f32_nonans'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call nnan float @llvm.vector.reduce.fmax.nxv1f32(<vscale x 1 x float> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call nnan float @llvm.vector.reduce.fmax.nxv1f32(<vscale x 1 x float> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float %red
  ;
    %red = call nnan float @llvm.vector.reduce.fmax.nxv1f32(<vscale x 1 x float> %v)
@@ -584,7 +584,7 @@ define float @vreduce_fmax_nxv1f32_nonans(<vscale x 1 x float> %v) {
  
  define float @vreduce_fmax_nxv1f32_nonans_noinfs(<vscale x 1 x float> %v) {
  ; CHECK-LABEL: 'vreduce_fmax_nxv1f32_nonans_noinfs'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call nnan ninf float @llvm.vector.reduce.fmax.nxv1f32(<vscale x 1 x float> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call nnan ninf float @llvm.vector.reduce.fmax.nxv1f32(<vscale x 1 x float> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float %red
  ;
    %red = call nnan ninf float @llvm.vector.reduce.fmax.nxv1f32(<vscale x 1 x float> %v)
@@ -595,7 +595,7 @@ declare float @llvm.vector.reduce.fmax.nxv2f32(<vscale x 2 x float>)
  
  define float @vreduce_fmax_nxv2f32(<vscale x 2 x float> %v) {
  ; CHECK-LABEL: 'vreduce_fmax_nxv2f32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call float @llvm.vector.reduce.fmax.nxv2f32(<vscale x 2 x float> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call float @llvm.vector.reduce.fmax.nxv2f32(<vscale x 2 x float> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float %red
  ;
    %red = call float @llvm.vector.reduce.fmax.nxv2f32(<vscale x 2 x float> %v)
@@ -606,7 +606,7 @@ declare float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float>)
  
  define float @vreduce_fmax_nxv4f32(<vscale x 4 x float> %v) {
  ; CHECK-LABEL: 'vreduce_fmax_nxv4f32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float %red
  ;
    %red = call float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v)
@@ -617,7 +617,7 @@ declare float @llvm.vector.reduce.fmax.nxv32f32(<vscale x 32 x float>)
  
  define float @vreduce_fmax_nxv32f32(<vscale x 32 x float> %v) {
  ; CHECK-LABEL: 'vreduce_fmax_nxv32f32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %red = call float @llvm.vector.reduce.fmax.nxv32f32(<vscale x 32 x float> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %red = call float @llvm.vector.reduce.fmax.nxv32f32(<vscale x 32 x float> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float %red
  ;
    %red = call float @llvm.vector.reduce.fmax.nxv32f32(<vscale x 32 x float> %v)
@@ -628,7 +628,7 @@ declare double @llvm.vector.reduce.fmax.nxv1f64(<vscale x 1 x double>)
  
  define double @vreduce_fmax_nxv1f64(<vscale x 1 x double> %v) {
  ; CHECK-LABEL: 'vreduce_fmax_nxv1f64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call double @llvm.vector.reduce.fmax.nxv1f64(<vscale x 1 x double> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call double @llvm.vector.reduce.fmax.nxv1f64(<vscale x 1 x double> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret double %red
  ;
    %red = call double @llvm.vector.reduce.fmax.nxv1f64(<vscale x 1 x double> %v)
@@ -637,7 +637,7 @@ define double @vreduce_fmax_nxv1f64(<vscale x 1 x double> %v) {
  
  define double @vreduce_fmax_nxv1f64_nonans(<vscale x 1 x double> %v) {
  ; CHECK-LABEL: 'vreduce_fmax_nxv1f64_nonans'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call nnan double @llvm.vector.reduce.fmax.nxv1f64(<vscale x 1 x double> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call nnan double @llvm.vector.reduce.fmax.nxv1f64(<vscale x 1 x double> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret double %red
  ;
    %red = call nnan double @llvm.vector.reduce.fmax.nxv1f64(<vscale x 1 x double> %v)
@@ -646,7 +646,7 @@ define double @vreduce_fmax_nxv1f64_nonans(<vscale x 1 x double> %v) {
  
  define double @vreduce_fmax_nxv1f64_nonans_noinfs(<vscale x 1 x double> %v) {
  ; CHECK-LABEL: 'vreduce_fmax_nxv1f64_nonans_noinfs'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call nnan ninf double @llvm.vector.reduce.fmax.nxv1f64(<vscale x 1 x double> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call nnan ninf double @llvm.vector.reduce.fmax.nxv1f64(<vscale x 1 x double> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret double %red
  ;
    %red = call nnan ninf double @llvm.vector.reduce.fmax.nxv1f64(<vscale x 1 x double> %v)
@@ -657,7 +657,7 @@ declare double @llvm.vector.reduce.fmax.nxv2f64(<vscale x 2 x double>)
  
  define double @vreduce_fmax_nxv2f64(<vscale x 2 x double> %v) {
  ; CHECK-LABEL: 'vreduce_fmax_nxv2f64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call double @llvm.vector.reduce.fmax.nxv2f64(<vscale x 2 x double> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call double @llvm.vector.reduce.fmax.nxv2f64(<vscale x 2 x double> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret double %red
  ;
    %red = call double @llvm.vector.reduce.fmax.nxv2f64(<vscale x 2 x double> %v)
@@ -668,7 +668,7 @@ declare double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double>)
  
  define double @vreduce_fmax_nxv4f64(<vscale x 4 x double> %v) {
  ; CHECK-LABEL: 'vreduce_fmax_nxv4f64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret double %red
  ;
    %red = call double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double> %v)
@@ -679,7 +679,7 @@ declare double @llvm.vector.reduce.fmax.nxv16f64(<vscale x 16 x double>)
  
  define double @vreduce_fmax_nxv16f64(<vscale x 16 x double> %v) {
  ; CHECK-LABEL: 'vreduce_fmax_nxv16f64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %red = call double @llvm.vector.reduce.fmax.nxv16f64(<vscale x 16 x double> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %red = call double @llvm.vector.reduce.fmax.nxv16f64(<vscale x 16 x double> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret double %red
  ;
    %red = call double @llvm.vector.reduce.fmax.nxv16f64(<vscale x 16 x double> %v)
@@ -688,7 +688,7 @@ define double @vreduce_fmax_nxv16f64(<vscale x 16 x double> %v) {
  
  define float @vreduce_nsz_fadd_nxv1f32(<vscale x 1 x float> %v, float %s) {
  ; CHECK-LABEL: 'vreduce_nsz_fadd_nxv1f32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call reassoc nsz float @llvm.vector.reduce.fadd.nxv1f32(float %s, <vscale x 1 x float> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call reassoc nsz float @llvm.vector.reduce.fadd.nxv1f32(float %s, <vscale x 1 x float> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float %red
  ;
    %red = call reassoc nsz float @llvm.vector.reduce.fadd.nxv1f32(float %s, <vscale x 1 x float> %v)
diff --git a/llvm/test/Analysis/CostModel/RISCV/reduce-scalable-int.ll b/llvm/test/Analysis/CostModel/RISCV/reduce-scalable-int.ll

index 51266e1..6c08a22 100644 (file)
--- a/llvm/test/Analysis/CostModel/RISCV/reduce-scalable-int.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/reduce-scalable-int.ll
@@ -6,7 +6,7 @@ declare i8 @llvm.vector.reduce.add.nxv1i8(<vscale x 1 x i8>)
  
  define signext i8 @vreduce_add_nxv1i8(<vscale x 1 x i8> %v) {
  ; CHECK-LABEL: 'vreduce_add_nxv1i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i8 @llvm.vector.reduce.add.nxv1i8(<vscale x 1 x i8> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i8 @llvm.vector.reduce.add.nxv1i8(<vscale x 1 x i8> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %red
  ;
    %red = call i8 @llvm.vector.reduce.add.nxv1i8(<vscale x 1 x i8> %v)
@@ -17,7 +17,7 @@ declare i8 @llvm.vector.reduce.umax.nxv1i8(<vscale x 1 x i8>)
  
  define signext i8 @vreduce_umax_nxv1i8(<vscale x 1 x i8> %v) {
  ; CHECK-LABEL: 'vreduce_umax_nxv1i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i8 @llvm.vector.reduce.umax.nxv1i8(<vscale x 1 x i8> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i8 @llvm.vector.reduce.umax.nxv1i8(<vscale x 1 x i8> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %red
  ;
    %red = call i8 @llvm.vector.reduce.umax.nxv1i8(<vscale x 1 x i8> %v)
@@ -28,7 +28,7 @@ declare i8 @llvm.vector.reduce.smax.nxv1i8(<vscale x 1 x i8>)
  
  define signext i8 @vreduce_smax_nxv1i8(<vscale x 1 x i8> %v) {
  ; CHECK-LABEL: 'vreduce_smax_nxv1i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i8 @llvm.vector.reduce.smax.nxv1i8(<vscale x 1 x i8> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i8 @llvm.vector.reduce.smax.nxv1i8(<vscale x 1 x i8> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %red
  ;
    %red = call i8 @llvm.vector.reduce.smax.nxv1i8(<vscale x 1 x i8> %v)
@@ -39,7 +39,7 @@ declare i8 @llvm.vector.reduce.umin.nxv1i8(<vscale x 1 x i8>)
  
  define signext i8 @vreduce_umin_nxv1i8(<vscale x 1 x i8> %v) {
  ; CHECK-LABEL: 'vreduce_umin_nxv1i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i8 @llvm.vector.reduce.umin.nxv1i8(<vscale x 1 x i8> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i8 @llvm.vector.reduce.umin.nxv1i8(<vscale x 1 x i8> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %red
  ;
    %red = call i8 @llvm.vector.reduce.umin.nxv1i8(<vscale x 1 x i8> %v)
@@ -50,7 +50,7 @@ declare i8 @llvm.vector.reduce.smin.nxv1i8(<vscale x 1 x i8>)
  
  define signext i8 @vreduce_smin_nxv1i8(<vscale x 1 x i8> %v) {
  ; CHECK-LABEL: 'vreduce_smin_nxv1i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i8 @llvm.vector.reduce.smin.nxv1i8(<vscale x 1 x i8> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i8 @llvm.vector.reduce.smin.nxv1i8(<vscale x 1 x i8> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %red
  ;
    %red = call i8 @llvm.vector.reduce.smin.nxv1i8(<vscale x 1 x i8> %v)
@@ -61,7 +61,7 @@ declare i8 @llvm.vector.reduce.and.nxv1i8(<vscale x 1 x i8>)
  
  define signext i8 @vreduce_and_nxv1i8(<vscale x 1 x i8> %v) {
  ; CHECK-LABEL: 'vreduce_and_nxv1i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i8 @llvm.vector.reduce.and.nxv1i8(<vscale x 1 x i8> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i8 @llvm.vector.reduce.and.nxv1i8(<vscale x 1 x i8> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %red
  ;
    %red = call i8 @llvm.vector.reduce.and.nxv1i8(<vscale x 1 x i8> %v)
@@ -72,7 +72,7 @@ declare i8 @llvm.vector.reduce.or.nxv1i8(<vscale x 1 x i8>)
  
  define signext i8 @vreduce_or_nxv1i8(<vscale x 1 x i8> %v) {
  ; CHECK-LABEL: 'vreduce_or_nxv1i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i8 @llvm.vector.reduce.or.nxv1i8(<vscale x 1 x i8> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i8 @llvm.vector.reduce.or.nxv1i8(<vscale x 1 x i8> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %red
  ;
    %red = call i8 @llvm.vector.reduce.or.nxv1i8(<vscale x 1 x i8> %v)
@@ -83,7 +83,7 @@ declare i8 @llvm.vector.reduce.xor.nxv1i8(<vscale x 1 x i8>)
  
  define signext i8 @vreduce_xor_nxv1i8(<vscale x 1 x i8> %v) {
  ; CHECK-LABEL: 'vreduce_xor_nxv1i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i8 @llvm.vector.reduce.xor.nxv1i8(<vscale x 1 x i8> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i8 @llvm.vector.reduce.xor.nxv1i8(<vscale x 1 x i8> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %red
  ;
    %red = call i8 @llvm.vector.reduce.xor.nxv1i8(<vscale x 1 x i8> %v)
@@ -94,7 +94,7 @@ declare i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8>)
  
  define signext i8 @vreduce_add_nxv2i8(<vscale x 2 x i8> %v) {
  ; CHECK-LABEL: 'vreduce_add_nxv2i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %red
  ;
    %red = call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> %v)
@@ -105,7 +105,7 @@ declare i8 @llvm.vector.reduce.umax.nxv2i8(<vscale x 2 x i8>)
  
  define signext i8 @vreduce_umax_nxv2i8(<vscale x 2 x i8> %v) {
  ; CHECK-LABEL: 'vreduce_umax_nxv2i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i8 @llvm.vector.reduce.umax.nxv2i8(<vscale x 2 x i8> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i8 @llvm.vector.reduce.umax.nxv2i8(<vscale x 2 x i8> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %red
  ;
    %red = call i8 @llvm.vector.reduce.umax.nxv2i8(<vscale x 2 x i8> %v)
@@ -116,7 +116,7 @@ declare i8 @llvm.vector.reduce.smax.nxv2i8(<vscale x 2 x i8>)
  
  define signext i8 @vreduce_smax_nxv2i8(<vscale x 2 x i8> %v) {
  ; CHECK-LABEL: 'vreduce_smax_nxv2i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i8 @llvm.vector.reduce.smax.nxv2i8(<vscale x 2 x i8> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i8 @llvm.vector.reduce.smax.nxv2i8(<vscale x 2 x i8> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %red
  ;
    %red = call i8 @llvm.vector.reduce.smax.nxv2i8(<vscale x 2 x i8> %v)
@@ -127,7 +127,7 @@ declare i8 @llvm.vector.reduce.umin.nxv2i8(<vscale x 2 x i8>)
  
  define signext i8 @vreduce_umin_nxv2i8(<vscale x 2 x i8> %v) {
  ; CHECK-LABEL: 'vreduce_umin_nxv2i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i8 @llvm.vector.reduce.umin.nxv2i8(<vscale x 2 x i8> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i8 @llvm.vector.reduce.umin.nxv2i8(<vscale x 2 x i8> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %red
  ;
    %red = call i8 @llvm.vector.reduce.umin.nxv2i8(<vscale x 2 x i8> %v)
@@ -138,7 +138,7 @@ declare i8 @llvm.vector.reduce.smin.nxv2i8(<vscale x 2 x i8>)
  
  define signext i8 @vreduce_smin_nxv2i8(<vscale x 2 x i8> %v) {
  ; CHECK-LABEL: 'vreduce_smin_nxv2i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i8 @llvm.vector.reduce.smin.nxv2i8(<vscale x 2 x i8> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i8 @llvm.vector.reduce.smin.nxv2i8(<vscale x 2 x i8> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %red
  ;
    %red = call i8 @llvm.vector.reduce.smin.nxv2i8(<vscale x 2 x i8> %v)
@@ -149,7 +149,7 @@ declare i8 @llvm.vector.reduce.and.nxv2i8(<vscale x 2 x i8>)
  
  define signext i8 @vreduce_and_nxv2i8(<vscale x 2 x i8> %v) {
  ; CHECK-LABEL: 'vreduce_and_nxv2i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i8 @llvm.vector.reduce.and.nxv2i8(<vscale x 2 x i8> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i8 @llvm.vector.reduce.and.nxv2i8(<vscale x 2 x i8> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %red
  ;
    %red = call i8 @llvm.vector.reduce.and.nxv2i8(<vscale x 2 x i8> %v)
@@ -160,7 +160,7 @@ declare i8 @llvm.vector.reduce.or.nxv2i8(<vscale x 2 x i8>)
  
  define signext i8 @vreduce_or_nxv2i8(<vscale x 2 x i8> %v) {
  ; CHECK-LABEL: 'vreduce_or_nxv2i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i8 @llvm.vector.reduce.or.nxv2i8(<vscale x 2 x i8> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i8 @llvm.vector.reduce.or.nxv2i8(<vscale x 2 x i8> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %red
  ;
    %red = call i8 @llvm.vector.reduce.or.nxv2i8(<vscale x 2 x i8> %v)
@@ -171,7 +171,7 @@ declare i8 @llvm.vector.reduce.xor.nxv2i8(<vscale x 2 x i8>)
  
  define signext i8 @vreduce_xor_nxv2i8(<vscale x 2 x i8> %v) {
  ; CHECK-LABEL: 'vreduce_xor_nxv2i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i8 @llvm.vector.reduce.xor.nxv2i8(<vscale x 2 x i8> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i8 @llvm.vector.reduce.xor.nxv2i8(<vscale x 2 x i8> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %red
  ;
    %red = call i8 @llvm.vector.reduce.xor.nxv2i8(<vscale x 2 x i8> %v)
@@ -182,7 +182,7 @@ declare i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8>)
  
  define signext i8 @vreduce_add_nxv4i8(<vscale x 4 x i8> %v) {
  ; CHECK-LABEL: 'vreduce_add_nxv4i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %red
  ;
    %red = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> %v)
@@ -193,7 +193,7 @@ declare i8 @llvm.vector.reduce.umax.nxv4i8(<vscale x 4 x i8>)
  
  define signext i8 @vreduce_umax_nxv4i8(<vscale x 4 x i8> %v) {
  ; CHECK-LABEL: 'vreduce_umax_nxv4i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i8 @llvm.vector.reduce.umax.nxv4i8(<vscale x 4 x i8> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i8 @llvm.vector.reduce.umax.nxv4i8(<vscale x 4 x i8> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %red
  ;
    %red = call i8 @llvm.vector.reduce.umax.nxv4i8(<vscale x 4 x i8> %v)
@@ -204,7 +204,7 @@ declare i8 @llvm.vector.reduce.smax.nxv4i8(<vscale x 4 x i8>)
  
  define signext i8 @vreduce_smax_nxv4i8(<vscale x 4 x i8> %v) {
  ; CHECK-LABEL: 'vreduce_smax_nxv4i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i8 @llvm.vector.reduce.smax.nxv4i8(<vscale x 4 x i8> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i8 @llvm.vector.reduce.smax.nxv4i8(<vscale x 4 x i8> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %red
  ;
    %red = call i8 @llvm.vector.reduce.smax.nxv4i8(<vscale x 4 x i8> %v)
@@ -215,7 +215,7 @@ declare i8 @llvm.vector.reduce.umin.nxv4i8(<vscale x 4 x i8>)
  
  define signext i8 @vreduce_umin_nxv4i8(<vscale x 4 x i8> %v) {
  ; CHECK-LABEL: 'vreduce_umin_nxv4i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i8 @llvm.vector.reduce.umin.nxv4i8(<vscale x 4 x i8> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i8 @llvm.vector.reduce.umin.nxv4i8(<vscale x 4 x i8> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %red
  ;
    %red = call i8 @llvm.vector.reduce.umin.nxv4i8(<vscale x 4 x i8> %v)
@@ -226,7 +226,7 @@ declare i8 @llvm.vector.reduce.smin.nxv4i8(<vscale x 4 x i8>)
  
  define signext i8 @vreduce_smin_nxv4i8(<vscale x 4 x i8> %v) {
  ; CHECK-LABEL: 'vreduce_smin_nxv4i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i8 @llvm.vector.reduce.smin.nxv4i8(<vscale x 4 x i8> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i8 @llvm.vector.reduce.smin.nxv4i8(<vscale x 4 x i8> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %red
  ;
    %red = call i8 @llvm.vector.reduce.smin.nxv4i8(<vscale x 4 x i8> %v)
@@ -237,7 +237,7 @@ declare i8 @llvm.vector.reduce.and.nxv4i8(<vscale x 4 x i8>)
  
  define signext i8 @vreduce_and_nxv4i8(<vscale x 4 x i8> %v) {
  ; CHECK-LABEL: 'vreduce_and_nxv4i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i8 @llvm.vector.reduce.and.nxv4i8(<vscale x 4 x i8> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i8 @llvm.vector.reduce.and.nxv4i8(<vscale x 4 x i8> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %red
  ;
    %red = call i8 @llvm.vector.reduce.and.nxv4i8(<vscale x 4 x i8> %v)
@@ -248,7 +248,7 @@ declare i8 @llvm.vector.reduce.or.nxv4i8(<vscale x 4 x i8>)
  
  define signext i8 @vreduce_or_nxv4i8(<vscale x 4 x i8> %v) {
  ; CHECK-LABEL: 'vreduce_or_nxv4i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i8 @llvm.vector.reduce.or.nxv4i8(<vscale x 4 x i8> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i8 @llvm.vector.reduce.or.nxv4i8(<vscale x 4 x i8> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %red
  ;
    %red = call i8 @llvm.vector.reduce.or.nxv4i8(<vscale x 4 x i8> %v)
@@ -259,7 +259,7 @@ declare i8 @llvm.vector.reduce.xor.nxv4i8(<vscale x 4 x i8>)
  
  define signext i8 @vreduce_xor_nxv4i8(<vscale x 4 x i8> %v) {
  ; CHECK-LABEL: 'vreduce_xor_nxv4i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i8 @llvm.vector.reduce.xor.nxv4i8(<vscale x 4 x i8> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i8 @llvm.vector.reduce.xor.nxv4i8(<vscale x 4 x i8> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %red
  ;
    %red = call i8 @llvm.vector.reduce.xor.nxv4i8(<vscale x 4 x i8> %v)
@@ -270,7 +270,7 @@ declare i16 @llvm.vector.reduce.add.nxv1i16(<vscale x 1 x i16>)
  
  define signext i16 @vreduce_add_nxv1i16(<vscale x 1 x i16> %v) {
  ; CHECK-LABEL: 'vreduce_add_nxv1i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i16 @llvm.vector.reduce.add.nxv1i16(<vscale x 1 x i16> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i16 @llvm.vector.reduce.add.nxv1i16(<vscale x 1 x i16> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %red
  ;
    %red = call i16 @llvm.vector.reduce.add.nxv1i16(<vscale x 1 x i16> %v)
@@ -280,7 +280,7 @@ define signext i16 @vreduce_add_nxv1i16(<vscale x 1 x i16> %v) {
  define signext i16 @vwreduce_add_nxv1i8(<vscale x 1 x i8> %v) {
  ; CHECK-LABEL: 'vwreduce_add_nxv1i8'
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = sext <vscale x 1 x i8> %v to <vscale x 1 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i16 @llvm.vector.reduce.add.nxv1i16(<vscale x 1 x i16> %e)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i16 @llvm.vector.reduce.add.nxv1i16(<vscale x 1 x i16> %e)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %red
  ;
    %e = sext <vscale x 1 x i8> %v to <vscale x 1 x i16>
@@ -291,7 +291,7 @@ define signext i16 @vwreduce_add_nxv1i8(<vscale x 1 x i8> %v) {
  define signext i16 @vwreduce_uadd_nxv1i8(<vscale x 1 x i8> %v) {
  ; CHECK-LABEL: 'vwreduce_uadd_nxv1i8'
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = sext <vscale x 1 x i8> %v to <vscale x 1 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i16 @llvm.vector.reduce.add.nxv1i16(<vscale x 1 x i16> %e)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i16 @llvm.vector.reduce.add.nxv1i16(<vscale x 1 x i16> %e)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %red
  ;
    %e = sext <vscale x 1 x i8> %v to <vscale x 1 x i16>
@@ -303,7 +303,7 @@ declare i16 @llvm.vector.reduce.umax.nxv1i16(<vscale x 1 x i16>)
  
  define signext i16 @vreduce_umax_nxv1i16(<vscale x 1 x i16> %v) {
  ; CHECK-LABEL: 'vreduce_umax_nxv1i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i16 @llvm.vector.reduce.umax.nxv1i16(<vscale x 1 x i16> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i16 @llvm.vector.reduce.umax.nxv1i16(<vscale x 1 x i16> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %red
  ;
    %red = call i16 @llvm.vector.reduce.umax.nxv1i16(<vscale x 1 x i16> %v)
@@ -314,7 +314,7 @@ declare i16 @llvm.vector.reduce.smax.nxv1i16(<vscale x 1 x i16>)
  
  define signext i16 @vreduce_smax_nxv1i16(<vscale x 1 x i16> %v) {
  ; CHECK-LABEL: 'vreduce_smax_nxv1i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i16 @llvm.vector.reduce.smax.nxv1i16(<vscale x 1 x i16> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i16 @llvm.vector.reduce.smax.nxv1i16(<vscale x 1 x i16> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %red
  ;
    %red = call i16 @llvm.vector.reduce.smax.nxv1i16(<vscale x 1 x i16> %v)
@@ -325,7 +325,7 @@ declare i16 @llvm.vector.reduce.umin.nxv1i16(<vscale x 1 x i16>)
  
  define signext i16 @vreduce_umin_nxv1i16(<vscale x 1 x i16> %v) {
  ; CHECK-LABEL: 'vreduce_umin_nxv1i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i16 @llvm.vector.reduce.umin.nxv1i16(<vscale x 1 x i16> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i16 @llvm.vector.reduce.umin.nxv1i16(<vscale x 1 x i16> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %red
  ;
    %red = call i16 @llvm.vector.reduce.umin.nxv1i16(<vscale x 1 x i16> %v)
@@ -336,7 +336,7 @@ declare i16 @llvm.vector.reduce.smin.nxv1i16(<vscale x 1 x i16>)
  
  define signext i16 @vreduce_smin_nxv1i16(<vscale x 1 x i16> %v) {
  ; CHECK-LABEL: 'vreduce_smin_nxv1i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i16 @llvm.vector.reduce.smin.nxv1i16(<vscale x 1 x i16> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i16 @llvm.vector.reduce.smin.nxv1i16(<vscale x 1 x i16> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %red
  ;
    %red = call i16 @llvm.vector.reduce.smin.nxv1i16(<vscale x 1 x i16> %v)
@@ -347,7 +347,7 @@ declare i16 @llvm.vector.reduce.and.nxv1i16(<vscale x 1 x i16>)
  
  define signext i16 @vreduce_and_nxv1i16(<vscale x 1 x i16> %v) {
  ; CHECK-LABEL: 'vreduce_and_nxv1i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i16 @llvm.vector.reduce.and.nxv1i16(<vscale x 1 x i16> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i16 @llvm.vector.reduce.and.nxv1i16(<vscale x 1 x i16> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %red
  ;
    %red = call i16 @llvm.vector.reduce.and.nxv1i16(<vscale x 1 x i16> %v)
@@ -358,7 +358,7 @@ declare i16 @llvm.vector.reduce.or.nxv1i16(<vscale x 1 x i16>)
  
  define signext i16 @vreduce_or_nxv1i16(<vscale x 1 x i16> %v) {
  ; CHECK-LABEL: 'vreduce_or_nxv1i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i16 @llvm.vector.reduce.or.nxv1i16(<vscale x 1 x i16> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i16 @llvm.vector.reduce.or.nxv1i16(<vscale x 1 x i16> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %red
  ;
    %red = call i16 @llvm.vector.reduce.or.nxv1i16(<vscale x 1 x i16> %v)
@@ -369,7 +369,7 @@ declare i16 @llvm.vector.reduce.xor.nxv1i16(<vscale x 1 x i16>)
  
  define signext i16 @vreduce_xor_nxv1i16(<vscale x 1 x i16> %v) {
  ; CHECK-LABEL: 'vreduce_xor_nxv1i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i16 @llvm.vector.reduce.xor.nxv1i16(<vscale x 1 x i16> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i16 @llvm.vector.reduce.xor.nxv1i16(<vscale x 1 x i16> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %red
  ;
    %red = call i16 @llvm.vector.reduce.xor.nxv1i16(<vscale x 1 x i16> %v)
@@ -380,7 +380,7 @@ declare i16 @llvm.vector.reduce.add.nxv2i16(<vscale x 2 x i16>)
  
  define signext i16 @vreduce_add_nxv2i16(<vscale x 2 x i16> %v) {
  ; CHECK-LABEL: 'vreduce_add_nxv2i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i16 @llvm.vector.reduce.add.nxv2i16(<vscale x 2 x i16> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i16 @llvm.vector.reduce.add.nxv2i16(<vscale x 2 x i16> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %red
  ;
    %red = call i16 @llvm.vector.reduce.add.nxv2i16(<vscale x 2 x i16> %v)
@@ -390,7 +390,7 @@ define signext i16 @vreduce_add_nxv2i16(<vscale x 2 x i16> %v) {
  define signext i16 @vwreduce_add_nxv2i8(<vscale x 2 x i8> %v) {
  ; CHECK-LABEL: 'vwreduce_add_nxv2i8'
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = sext <vscale x 2 x i8> %v to <vscale x 2 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i16 @llvm.vector.reduce.add.nxv2i16(<vscale x 2 x i16> %e)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i16 @llvm.vector.reduce.add.nxv2i16(<vscale x 2 x i16> %e)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %red
  ;
    %e = sext <vscale x 2 x i8> %v to <vscale x 2 x i16>
@@ -401,7 +401,7 @@ define signext i16 @vwreduce_add_nxv2i8(<vscale x 2 x i8> %v) {
  define signext i16 @vwreduce_uadd_nxv2i8(<vscale x 2 x i8> %v) {
  ; CHECK-LABEL: 'vwreduce_uadd_nxv2i8'
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = sext <vscale x 2 x i8> %v to <vscale x 2 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i16 @llvm.vector.reduce.add.nxv2i16(<vscale x 2 x i16> %e)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i16 @llvm.vector.reduce.add.nxv2i16(<vscale x 2 x i16> %e)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %red
  ;
    %e = sext <vscale x 2 x i8> %v to <vscale x 2 x i16>
@@ -413,7 +413,7 @@ declare i16 @llvm.vector.reduce.umax.nxv2i16(<vscale x 2 x i16>)
  
  define signext i16 @vreduce_umax_nxv2i16(<vscale x 2 x i16> %v) {
  ; CHECK-LABEL: 'vreduce_umax_nxv2i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i16 @llvm.vector.reduce.umax.nxv2i16(<vscale x 2 x i16> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i16 @llvm.vector.reduce.umax.nxv2i16(<vscale x 2 x i16> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %red
  ;
    %red = call i16 @llvm.vector.reduce.umax.nxv2i16(<vscale x 2 x i16> %v)
@@ -424,7 +424,7 @@ declare i16 @llvm.vector.reduce.smax.nxv2i16(<vscale x 2 x i16>)
  
  define signext i16 @vreduce_smax_nxv2i16(<vscale x 2 x i16> %v) {
  ; CHECK-LABEL: 'vreduce_smax_nxv2i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i16 @llvm.vector.reduce.smax.nxv2i16(<vscale x 2 x i16> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i16 @llvm.vector.reduce.smax.nxv2i16(<vscale x 2 x i16> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %red
  ;
    %red = call i16 @llvm.vector.reduce.smax.nxv2i16(<vscale x 2 x i16> %v)
@@ -435,7 +435,7 @@ declare i16 @llvm.vector.reduce.umin.nxv2i16(<vscale x 2 x i16>)
  
  define signext i16 @vreduce_umin_nxv2i16(<vscale x 2 x i16> %v) {
  ; CHECK-LABEL: 'vreduce_umin_nxv2i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i16 @llvm.vector.reduce.umin.nxv2i16(<vscale x 2 x i16> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i16 @llvm.vector.reduce.umin.nxv2i16(<vscale x 2 x i16> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %red
  ;
    %red = call i16 @llvm.vector.reduce.umin.nxv2i16(<vscale x 2 x i16> %v)
@@ -446,7 +446,7 @@ declare i16 @llvm.vector.reduce.smin.nxv2i16(<vscale x 2 x i16>)
  
  define signext i16 @vreduce_smin_nxv2i16(<vscale x 2 x i16> %v) {
  ; CHECK-LABEL: 'vreduce_smin_nxv2i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i16 @llvm.vector.reduce.smin.nxv2i16(<vscale x 2 x i16> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i16 @llvm.vector.reduce.smin.nxv2i16(<vscale x 2 x i16> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %red
  ;
    %red = call i16 @llvm.vector.reduce.smin.nxv2i16(<vscale x 2 x i16> %v)
@@ -457,7 +457,7 @@ declare i16 @llvm.vector.reduce.and.nxv2i16(<vscale x 2 x i16>)
  
  define signext i16 @vreduce_and_nxv2i16(<vscale x 2 x i16> %v) {
  ; CHECK-LABEL: 'vreduce_and_nxv2i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i16 @llvm.vector.reduce.and.nxv2i16(<vscale x 2 x i16> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i16 @llvm.vector.reduce.and.nxv2i16(<vscale x 2 x i16> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %red
  ;
    %red = call i16 @llvm.vector.reduce.and.nxv2i16(<vscale x 2 x i16> %v)
@@ -468,7 +468,7 @@ declare i16 @llvm.vector.reduce.or.nxv2i16(<vscale x 2 x i16>)
  
  define signext i16 @vreduce_or_nxv2i16(<vscale x 2 x i16> %v) {
  ; CHECK-LABEL: 'vreduce_or_nxv2i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i16 @llvm.vector.reduce.or.nxv2i16(<vscale x 2 x i16> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i16 @llvm.vector.reduce.or.nxv2i16(<vscale x 2 x i16> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %red
  ;
    %red = call i16 @llvm.vector.reduce.or.nxv2i16(<vscale x 2 x i16> %v)
@@ -479,7 +479,7 @@ declare i16 @llvm.vector.reduce.xor.nxv2i16(<vscale x 2 x i16>)
  
  define signext i16 @vreduce_xor_nxv2i16(<vscale x 2 x i16> %v) {
  ; CHECK-LABEL: 'vreduce_xor_nxv2i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i16 @llvm.vector.reduce.xor.nxv2i16(<vscale x 2 x i16> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i16 @llvm.vector.reduce.xor.nxv2i16(<vscale x 2 x i16> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %red
  ;
    %red = call i16 @llvm.vector.reduce.xor.nxv2i16(<vscale x 2 x i16> %v)
@@ -490,7 +490,7 @@ declare i16 @llvm.vector.reduce.add.nxv4i16(<vscale x 4 x i16>)
  
  define signext i16 @vreduce_add_nxv4i16(<vscale x 4 x i16> %v) {
  ; CHECK-LABEL: 'vreduce_add_nxv4i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i16 @llvm.vector.reduce.add.nxv4i16(<vscale x 4 x i16> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i16 @llvm.vector.reduce.add.nxv4i16(<vscale x 4 x i16> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %red
  ;
    %red = call i16 @llvm.vector.reduce.add.nxv4i16(<vscale x 4 x i16> %v)
@@ -500,7 +500,7 @@ define signext i16 @vreduce_add_nxv4i16(<vscale x 4 x i16> %v) {
  define signext i16 @vwreduce_add_nxv4i8(<vscale x 4 x i8> %v) {
  ; CHECK-LABEL: 'vwreduce_add_nxv4i8'
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = sext <vscale x 4 x i8> %v to <vscale x 4 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i16 @llvm.vector.reduce.add.nxv4i16(<vscale x 4 x i16> %e)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i16 @llvm.vector.reduce.add.nxv4i16(<vscale x 4 x i16> %e)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %red
  ;
    %e = sext <vscale x 4 x i8> %v to <vscale x 4 x i16>
@@ -511,7 +511,7 @@ define signext i16 @vwreduce_add_nxv4i8(<vscale x 4 x i8> %v) {
  define signext i16 @vwreduce_uadd_nxv4i8(<vscale x 4 x i8> %v) {
  ; CHECK-LABEL: 'vwreduce_uadd_nxv4i8'
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = sext <vscale x 4 x i8> %v to <vscale x 4 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i16 @llvm.vector.reduce.add.nxv4i16(<vscale x 4 x i16> %e)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i16 @llvm.vector.reduce.add.nxv4i16(<vscale x 4 x i16> %e)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %red
  ;
    %e = sext <vscale x 4 x i8> %v to <vscale x 4 x i16>
@@ -523,7 +523,7 @@ declare i16 @llvm.vector.reduce.umax.nxv4i16(<vscale x 4 x i16>)
  
  define signext i16 @vreduce_umax_nxv4i16(<vscale x 4 x i16> %v) {
  ; CHECK-LABEL: 'vreduce_umax_nxv4i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i16 @llvm.vector.reduce.umax.nxv4i16(<vscale x 4 x i16> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i16 @llvm.vector.reduce.umax.nxv4i16(<vscale x 4 x i16> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %red
  ;
    %red = call i16 @llvm.vector.reduce.umax.nxv4i16(<vscale x 4 x i16> %v)
@@ -534,7 +534,7 @@ declare i16 @llvm.vector.reduce.smax.nxv4i16(<vscale x 4 x i16>)
  
  define signext i16 @vreduce_smax_nxv4i16(<vscale x 4 x i16> %v) {
  ; CHECK-LABEL: 'vreduce_smax_nxv4i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i16 @llvm.vector.reduce.smax.nxv4i16(<vscale x 4 x i16> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i16 @llvm.vector.reduce.smax.nxv4i16(<vscale x 4 x i16> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %red
  ;
    %red = call i16 @llvm.vector.reduce.smax.nxv4i16(<vscale x 4 x i16> %v)
@@ -545,7 +545,7 @@ declare i16 @llvm.vector.reduce.umin.nxv4i16(<vscale x 4 x i16>)
  
  define signext i16 @vreduce_umin_nxv4i16(<vscale x 4 x i16> %v) {
  ; CHECK-LABEL: 'vreduce_umin_nxv4i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i16 @llvm.vector.reduce.umin.nxv4i16(<vscale x 4 x i16> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i16 @llvm.vector.reduce.umin.nxv4i16(<vscale x 4 x i16> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %red
  ;
    %red = call i16 @llvm.vector.reduce.umin.nxv4i16(<vscale x 4 x i16> %v)
@@ -556,7 +556,7 @@ declare i16 @llvm.vector.reduce.smin.nxv4i16(<vscale x 4 x i16>)
  
  define signext i16 @vreduce_smin_nxv4i16(<vscale x 4 x i16> %v) {
  ; CHECK-LABEL: 'vreduce_smin_nxv4i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i16 @llvm.vector.reduce.smin.nxv4i16(<vscale x 4 x i16> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i16 @llvm.vector.reduce.smin.nxv4i16(<vscale x 4 x i16> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %red
  ;
    %red = call i16 @llvm.vector.reduce.smin.nxv4i16(<vscale x 4 x i16> %v)
@@ -567,7 +567,7 @@ declare i16 @llvm.vector.reduce.and.nxv4i16(<vscale x 4 x i16>)
  
  define signext i16 @vreduce_and_nxv4i16(<vscale x 4 x i16> %v) {
  ; CHECK-LABEL: 'vreduce_and_nxv4i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i16 @llvm.vector.reduce.and.nxv4i16(<vscale x 4 x i16> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i16 @llvm.vector.reduce.and.nxv4i16(<vscale x 4 x i16> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %red
  ;
    %red = call i16 @llvm.vector.reduce.and.nxv4i16(<vscale x 4 x i16> %v)
@@ -578,7 +578,7 @@ declare i16 @llvm.vector.reduce.or.nxv4i16(<vscale x 4 x i16>)
  
  define signext i16 @vreduce_or_nxv4i16(<vscale x 4 x i16> %v) {
  ; CHECK-LABEL: 'vreduce_or_nxv4i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i16 @llvm.vector.reduce.or.nxv4i16(<vscale x 4 x i16> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i16 @llvm.vector.reduce.or.nxv4i16(<vscale x 4 x i16> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %red
  ;
    %red = call i16 @llvm.vector.reduce.or.nxv4i16(<vscale x 4 x i16> %v)
@@ -589,7 +589,7 @@ declare i16 @llvm.vector.reduce.xor.nxv4i16(<vscale x 4 x i16>)
  
  define signext i16 @vreduce_xor_nxv4i16(<vscale x 4 x i16> %v) {
  ; CHECK-LABEL: 'vreduce_xor_nxv4i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i16 @llvm.vector.reduce.xor.nxv4i16(<vscale x 4 x i16> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i16 @llvm.vector.reduce.xor.nxv4i16(<vscale x 4 x i16> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %red
  ;
    %red = call i16 @llvm.vector.reduce.xor.nxv4i16(<vscale x 4 x i16> %v)
@@ -600,7 +600,7 @@ declare i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32>)
  
  define signext i32 @vreduce_add_nxv1i32(<vscale x 1 x i32> %v) {
  ; CHECK-LABEL: 'vreduce_add_nxv1i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %red
  ;
    %red = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> %v)
@@ -610,7 +610,7 @@ define signext i32 @vreduce_add_nxv1i32(<vscale x 1 x i32> %v) {
  define signext i32 @vwreduce_add_nxv1i16(<vscale x 1 x i16> %v) {
  ; CHECK-LABEL: 'vwreduce_add_nxv1i16'
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = sext <vscale x 1 x i16> %v to <vscale x 1 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> %e)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> %e)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %red
  ;
    %e = sext <vscale x 1 x i16> %v to <vscale x 1 x i32>
@@ -621,7 +621,7 @@ define signext i32 @vwreduce_add_nxv1i16(<vscale x 1 x i16> %v) {
  define signext i32 @vwreduce_uadd_nxv1i16(<vscale x 1 x i16> %v) {
  ; CHECK-LABEL: 'vwreduce_uadd_nxv1i16'
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = zext <vscale x 1 x i16> %v to <vscale x 1 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> %e)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> %e)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %red
  ;
    %e = zext <vscale x 1 x i16> %v to <vscale x 1 x i32>
@@ -633,7 +633,7 @@ declare i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32>)
  
  define signext i32 @vreduce_umax_nxv1i32(<vscale x 1 x i32> %v) {
  ; CHECK-LABEL: 'vreduce_umax_nxv1i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %red
  ;
    %red = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> %v)
@@ -644,7 +644,7 @@ declare i32 @llvm.vector.reduce.smax.nxv1i32(<vscale x 1 x i32>)
  
  define signext i32 @vreduce_smax_nxv1i32(<vscale x 1 x i32> %v) {
  ; CHECK-LABEL: 'vreduce_smax_nxv1i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i32 @llvm.vector.reduce.smax.nxv1i32(<vscale x 1 x i32> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i32 @llvm.vector.reduce.smax.nxv1i32(<vscale x 1 x i32> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %red
  ;
    %red = call i32 @llvm.vector.reduce.smax.nxv1i32(<vscale x 1 x i32> %v)
@@ -655,7 +655,7 @@ declare i32 @llvm.vector.reduce.umin.nxv1i32(<vscale x 1 x i32>)
  
  define signext i32 @vreduce_umin_nxv1i32(<vscale x 1 x i32> %v) {
  ; CHECK-LABEL: 'vreduce_umin_nxv1i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i32 @llvm.vector.reduce.umin.nxv1i32(<vscale x 1 x i32> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i32 @llvm.vector.reduce.umin.nxv1i32(<vscale x 1 x i32> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %red
  ;
    %red = call i32 @llvm.vector.reduce.umin.nxv1i32(<vscale x 1 x i32> %v)
@@ -666,7 +666,7 @@ declare i32 @llvm.vector.reduce.smin.nxv1i32(<vscale x 1 x i32>)
  
  define signext i32 @vreduce_smin_nxv1i32(<vscale x 1 x i32> %v) {
  ; CHECK-LABEL: 'vreduce_smin_nxv1i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i32 @llvm.vector.reduce.smin.nxv1i32(<vscale x 1 x i32> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i32 @llvm.vector.reduce.smin.nxv1i32(<vscale x 1 x i32> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %red
  ;
    %red = call i32 @llvm.vector.reduce.smin.nxv1i32(<vscale x 1 x i32> %v)
@@ -677,7 +677,7 @@ declare i32 @llvm.vector.reduce.and.nxv1i32(<vscale x 1 x i32>)
  
  define signext i32 @vreduce_and_nxv1i32(<vscale x 1 x i32> %v) {
  ; CHECK-LABEL: 'vreduce_and_nxv1i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i32 @llvm.vector.reduce.and.nxv1i32(<vscale x 1 x i32> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i32 @llvm.vector.reduce.and.nxv1i32(<vscale x 1 x i32> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %red
  ;
    %red = call i32 @llvm.vector.reduce.and.nxv1i32(<vscale x 1 x i32> %v)
@@ -688,7 +688,7 @@ declare i32 @llvm.vector.reduce.or.nxv1i32(<vscale x 1 x i32>)
  
  define signext i32 @vreduce_or_nxv1i32(<vscale x 1 x i32> %v) {
  ; CHECK-LABEL: 'vreduce_or_nxv1i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i32 @llvm.vector.reduce.or.nxv1i32(<vscale x 1 x i32> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i32 @llvm.vector.reduce.or.nxv1i32(<vscale x 1 x i32> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %red
  ;
    %red = call i32 @llvm.vector.reduce.or.nxv1i32(<vscale x 1 x i32> %v)
@@ -699,7 +699,7 @@ declare i32 @llvm.vector.reduce.xor.nxv1i32(<vscale x 1 x i32>)
  
  define signext i32 @vreduce_xor_nxv1i32(<vscale x 1 x i32> %v) {
  ; CHECK-LABEL: 'vreduce_xor_nxv1i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i32 @llvm.vector.reduce.xor.nxv1i32(<vscale x 1 x i32> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i32 @llvm.vector.reduce.xor.nxv1i32(<vscale x 1 x i32> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %red
  ;
    %red = call i32 @llvm.vector.reduce.xor.nxv1i32(<vscale x 1 x i32> %v)
@@ -710,7 +710,7 @@ declare i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32>)
  
  define signext i32 @vreduce_add_nxv2i32(<vscale x 2 x i32> %v) {
  ; CHECK-LABEL: 'vreduce_add_nxv2i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %red
  ;
    %red = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> %v)
@@ -720,7 +720,7 @@ define signext i32 @vreduce_add_nxv2i32(<vscale x 2 x i32> %v) {
  define signext i32 @vwreduce_add_nxv2i16(<vscale x 2 x i16> %v) {
  ; CHECK-LABEL: 'vwreduce_add_nxv2i16'
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = sext <vscale x 2 x i16> %v to <vscale x 2 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> %e)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> %e)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %red
  ;
    %e = sext <vscale x 2 x i16> %v to <vscale x 2 x i32>
@@ -731,7 +731,7 @@ define signext i32 @vwreduce_add_nxv2i16(<vscale x 2 x i16> %v) {
  define signext i32 @vwreduce_uadd_nxv2i16(<vscale x 2 x i16> %v) {
  ; CHECK-LABEL: 'vwreduce_uadd_nxv2i16'
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = zext <vscale x 2 x i16> %v to <vscale x 2 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> %e)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> %e)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %red
  ;
    %e = zext <vscale x 2 x i16> %v to <vscale x 2 x i32>
@@ -743,7 +743,7 @@ declare i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32>)
  
  define signext i32 @vreduce_umax_nxv2i32(<vscale x 2 x i32> %v) {
  ; CHECK-LABEL: 'vreduce_umax_nxv2i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %red
  ;
    %red = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> %v)
@@ -754,7 +754,7 @@ declare i32 @llvm.vector.reduce.smax.nxv2i32(<vscale x 2 x i32>)
  
  define signext i32 @vreduce_smax_nxv2i32(<vscale x 2 x i32> %v) {
  ; CHECK-LABEL: 'vreduce_smax_nxv2i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i32 @llvm.vector.reduce.smax.nxv2i32(<vscale x 2 x i32> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i32 @llvm.vector.reduce.smax.nxv2i32(<vscale x 2 x i32> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %red
  ;
    %red = call i32 @llvm.vector.reduce.smax.nxv2i32(<vscale x 2 x i32> %v)
@@ -765,7 +765,7 @@ declare i32 @llvm.vector.reduce.umin.nxv2i32(<vscale x 2 x i32>)
  
  define signext i32 @vreduce_umin_nxv2i32(<vscale x 2 x i32> %v) {
  ; CHECK-LABEL: 'vreduce_umin_nxv2i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i32 @llvm.vector.reduce.umin.nxv2i32(<vscale x 2 x i32> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i32 @llvm.vector.reduce.umin.nxv2i32(<vscale x 2 x i32> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %red
  ;
    %red = call i32 @llvm.vector.reduce.umin.nxv2i32(<vscale x 2 x i32> %v)
@@ -776,7 +776,7 @@ declare i32 @llvm.vector.reduce.smin.nxv2i32(<vscale x 2 x i32>)
  
  define signext i32 @vreduce_smin_nxv2i32(<vscale x 2 x i32> %v) {
  ; CHECK-LABEL: 'vreduce_smin_nxv2i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i32 @llvm.vector.reduce.smin.nxv2i32(<vscale x 2 x i32> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i32 @llvm.vector.reduce.smin.nxv2i32(<vscale x 2 x i32> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %red
  ;
    %red = call i32 @llvm.vector.reduce.smin.nxv2i32(<vscale x 2 x i32> %v)
@@ -787,7 +787,7 @@ declare i32 @llvm.vector.reduce.and.nxv2i32(<vscale x 2 x i32>)
  
  define signext i32 @vreduce_and_nxv2i32(<vscale x 2 x i32> %v) {
  ; CHECK-LABEL: 'vreduce_and_nxv2i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i32 @llvm.vector.reduce.and.nxv2i32(<vscale x 2 x i32> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i32 @llvm.vector.reduce.and.nxv2i32(<vscale x 2 x i32> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %red
  ;
    %red = call i32 @llvm.vector.reduce.and.nxv2i32(<vscale x 2 x i32> %v)
@@ -798,7 +798,7 @@ declare i32 @llvm.vector.reduce.or.nxv2i32(<vscale x 2 x i32>)
  
  define signext i32 @vreduce_or_nxv2i32(<vscale x 2 x i32> %v) {
  ; CHECK-LABEL: 'vreduce_or_nxv2i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i32 @llvm.vector.reduce.or.nxv2i32(<vscale x 2 x i32> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i32 @llvm.vector.reduce.or.nxv2i32(<vscale x 2 x i32> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %red
  ;
    %red = call i32 @llvm.vector.reduce.or.nxv2i32(<vscale x 2 x i32> %v)
@@ -809,7 +809,7 @@ declare i32 @llvm.vector.reduce.xor.nxv2i32(<vscale x 2 x i32>)
  
  define signext i32 @vreduce_xor_nxv2i32(<vscale x 2 x i32> %v) {
  ; CHECK-LABEL: 'vreduce_xor_nxv2i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i32 @llvm.vector.reduce.xor.nxv2i32(<vscale x 2 x i32> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i32 @llvm.vector.reduce.xor.nxv2i32(<vscale x 2 x i32> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %red
  ;
    %red = call i32 @llvm.vector.reduce.xor.nxv2i32(<vscale x 2 x i32> %v)
@@ -820,7 +820,7 @@ declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)
  
  define signext i32 @vreduce_add_nxv4i32(<vscale x 4 x i32> %v) {
  ; CHECK-LABEL: 'vreduce_add_nxv4i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %red
  ;
    %red = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %v)
@@ -830,7 +830,7 @@ define signext i32 @vreduce_add_nxv4i32(<vscale x 4 x i32> %v) {
  define signext i32 @vwreduce_add_nxv4i16(<vscale x 4 x i16> %v) {
  ; CHECK-LABEL: 'vwreduce_add_nxv4i16'
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = sext <vscale x 4 x i16> %v to <vscale x 4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %e)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %e)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %red
  ;
    %e = sext <vscale x 4 x i16> %v to <vscale x 4 x i32>
@@ -841,7 +841,7 @@ define signext i32 @vwreduce_add_nxv4i16(<vscale x 4 x i16> %v) {
  define signext i32 @vwreduce_uadd_nxv4i16(<vscale x 4 x i16> %v) {
  ; CHECK-LABEL: 'vwreduce_uadd_nxv4i16'
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = zext <vscale x 4 x i16> %v to <vscale x 4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %e)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %e)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %red
  ;
    %e = zext <vscale x 4 x i16> %v to <vscale x 4 x i32>
@@ -853,7 +853,7 @@ declare i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32>)
  
  define signext i32 @vreduce_umax_nxv4i32(<vscale x 4 x i32> %v) {
  ; CHECK-LABEL: 'vreduce_umax_nxv4i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %red
  ;
    %red = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> %v)
@@ -864,7 +864,7 @@ declare i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32>)
  
  define signext i32 @vreduce_smax_nxv4i32(<vscale x 4 x i32> %v) {
  ; CHECK-LABEL: 'vreduce_smax_nxv4i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %red
  ;
    %red = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> %v)
@@ -875,7 +875,7 @@ declare i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32>)
  
  define signext i32 @vreduce_umin_nxv4i32(<vscale x 4 x i32> %v) {
  ; CHECK-LABEL: 'vreduce_umin_nxv4i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %red
  ;
    %red = call i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32> %v)
@@ -886,7 +886,7 @@ declare i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32>)
  
  define signext i32 @vreduce_smin_nxv4i32(<vscale x 4 x i32> %v) {
  ; CHECK-LABEL: 'vreduce_smin_nxv4i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %red
  ;
    %red = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> %v)
@@ -897,7 +897,7 @@ declare i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32>)
  
  define signext i32 @vreduce_and_nxv4i32(<vscale x 4 x i32> %v) {
  ; CHECK-LABEL: 'vreduce_and_nxv4i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %red
  ;
    %red = call i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32> %v)
@@ -908,7 +908,7 @@ declare i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32>)
  
  define signext i32 @vreduce_or_nxv4i32(<vscale x 4 x i32> %v) {
  ; CHECK-LABEL: 'vreduce_or_nxv4i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %red
  ;
    %red = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> %v)
@@ -919,7 +919,7 @@ declare i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32>)
  
  define signext i32 @vreduce_xor_nxv4i32(<vscale x 4 x i32> %v) {
  ; CHECK-LABEL: 'vreduce_xor_nxv4i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %red
  ;
    %red = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> %v)
@@ -930,7 +930,7 @@ declare i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64>)
  
  define i64 @vreduce_add_nxv1i64(<vscale x 1 x i64> %v) {
  ; CHECK-LABEL: 'vreduce_add_nxv1i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %red
  ;
    %red = call i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64> %v)
@@ -940,7 +940,7 @@ define i64 @vreduce_add_nxv1i64(<vscale x 1 x i64> %v) {
  define i64 @vwreduce_add_nxv1i32(<vscale x 1 x i32> %v) {
  ; CHECK-LABEL: 'vwreduce_add_nxv1i32'
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = sext <vscale x 1 x i32> %v to <vscale x 1 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64> %e)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64> %e)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %red
  ;
    %e = sext <vscale x 1 x i32> %v to <vscale x 1 x i64>
@@ -951,7 +951,7 @@ define i64 @vwreduce_add_nxv1i32(<vscale x 1 x i32> %v) {
  define i64 @vwreduce_uadd_nxv1i32(<vscale x 1 x i32> %v) {
  ; CHECK-LABEL: 'vwreduce_uadd_nxv1i32'
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = zext <vscale x 1 x i32> %v to <vscale x 1 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64> %e)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64> %e)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %red
  ;
    %e = zext <vscale x 1 x i32> %v to <vscale x 1 x i64>
@@ -963,7 +963,7 @@ declare i64 @llvm.vector.reduce.umax.nxv1i64(<vscale x 1 x i64>)
  
  define i64 @vreduce_umax_nxv1i64(<vscale x 1 x i64> %v) {
  ; CHECK-LABEL: 'vreduce_umax_nxv1i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i64 @llvm.vector.reduce.umax.nxv1i64(<vscale x 1 x i64> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i64 @llvm.vector.reduce.umax.nxv1i64(<vscale x 1 x i64> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %red
  ;
    %red = call i64 @llvm.vector.reduce.umax.nxv1i64(<vscale x 1 x i64> %v)
@@ -974,7 +974,7 @@ declare i64 @llvm.vector.reduce.smax.nxv1i64(<vscale x 1 x i64>)
  
  define i64 @vreduce_smax_nxv1i64(<vscale x 1 x i64> %v) {
  ; CHECK-LABEL: 'vreduce_smax_nxv1i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i64 @llvm.vector.reduce.smax.nxv1i64(<vscale x 1 x i64> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i64 @llvm.vector.reduce.smax.nxv1i64(<vscale x 1 x i64> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %red
  ;
    %red = call i64 @llvm.vector.reduce.smax.nxv1i64(<vscale x 1 x i64> %v)
@@ -985,7 +985,7 @@ declare i64 @llvm.vector.reduce.umin.nxv1i64(<vscale x 1 x i64>)
  
  define i64 @vreduce_umin_nxv1i64(<vscale x 1 x i64> %v) {
  ; CHECK-LABEL: 'vreduce_umin_nxv1i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i64 @llvm.vector.reduce.umin.nxv1i64(<vscale x 1 x i64> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i64 @llvm.vector.reduce.umin.nxv1i64(<vscale x 1 x i64> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %red
  ;
    %red = call i64 @llvm.vector.reduce.umin.nxv1i64(<vscale x 1 x i64> %v)
@@ -996,7 +996,7 @@ declare i64 @llvm.vector.reduce.smin.nxv1i64(<vscale x 1 x i64>)
  
  define i64 @vreduce_smin_nxv1i64(<vscale x 1 x i64> %v) {
  ; CHECK-LABEL: 'vreduce_smin_nxv1i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i64 @llvm.vector.reduce.smin.nxv1i64(<vscale x 1 x i64> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i64 @llvm.vector.reduce.smin.nxv1i64(<vscale x 1 x i64> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %red
  ;
    %red = call i64 @llvm.vector.reduce.smin.nxv1i64(<vscale x 1 x i64> %v)
@@ -1007,7 +1007,7 @@ declare i64 @llvm.vector.reduce.and.nxv1i64(<vscale x 1 x i64>)
  
  define i64 @vreduce_and_nxv1i64(<vscale x 1 x i64> %v) {
  ; CHECK-LABEL: 'vreduce_and_nxv1i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i64 @llvm.vector.reduce.and.nxv1i64(<vscale x 1 x i64> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i64 @llvm.vector.reduce.and.nxv1i64(<vscale x 1 x i64> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %red
  ;
    %red = call i64 @llvm.vector.reduce.and.nxv1i64(<vscale x 1 x i64> %v)
@@ -1018,7 +1018,7 @@ declare i64 @llvm.vector.reduce.or.nxv1i64(<vscale x 1 x i64>)
  
  define i64 @vreduce_or_nxv1i64(<vscale x 1 x i64> %v) {
  ; CHECK-LABEL: 'vreduce_or_nxv1i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i64 @llvm.vector.reduce.or.nxv1i64(<vscale x 1 x i64> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i64 @llvm.vector.reduce.or.nxv1i64(<vscale x 1 x i64> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %red
  ;
    %red = call i64 @llvm.vector.reduce.or.nxv1i64(<vscale x 1 x i64> %v)
@@ -1029,7 +1029,7 @@ declare i64 @llvm.vector.reduce.xor.nxv1i64(<vscale x 1 x i64>)
  
  define i64 @vreduce_xor_nxv1i64(<vscale x 1 x i64> %v) {
  ; CHECK-LABEL: 'vreduce_xor_nxv1i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %red = call i64 @llvm.vector.reduce.xor.nxv1i64(<vscale x 1 x i64> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %red = call i64 @llvm.vector.reduce.xor.nxv1i64(<vscale x 1 x i64> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %red
  ;
    %red = call i64 @llvm.vector.reduce.xor.nxv1i64(<vscale x 1 x i64> %v)
@@ -1040,7 +1040,7 @@ declare i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64>)
  
  define i64 @vreduce_add_nxv2i64(<vscale x 2 x i64> %v) {
  ; CHECK-LABEL: 'vreduce_add_nxv2i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %red
  ;
    %red = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %v)
@@ -1050,7 +1050,7 @@ define i64 @vreduce_add_nxv2i64(<vscale x 2 x i64> %v) {
  define i64 @vwreduce_add_nxv2i32(<vscale x 2 x i32> %v) {
  ; CHECK-LABEL: 'vwreduce_add_nxv2i32'
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = sext <vscale x 2 x i32> %v to <vscale x 2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %e)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %e)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %red
  ;
    %e = sext <vscale x 2 x i32> %v to <vscale x 2 x i64>
@@ -1061,7 +1061,7 @@ define i64 @vwreduce_add_nxv2i32(<vscale x 2 x i32> %v) {
  define i64 @vwreduce_uadd_nxv2i32(<vscale x 2 x i32> %v) {
  ; CHECK-LABEL: 'vwreduce_uadd_nxv2i32'
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = zext <vscale x 2 x i32> %v to <vscale x 2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %e)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %e)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %red
  ;
    %e = zext <vscale x 2 x i32> %v to <vscale x 2 x i64>
@@ -1073,7 +1073,7 @@ declare i64 @llvm.vector.reduce.umax.nxv2i64(<vscale x 2 x i64>)
  
  define i64 @vreduce_umax_nxv2i64(<vscale x 2 x i64> %v) {
  ; CHECK-LABEL: 'vreduce_umax_nxv2i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i64 @llvm.vector.reduce.umax.nxv2i64(<vscale x 2 x i64> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i64 @llvm.vector.reduce.umax.nxv2i64(<vscale x 2 x i64> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %red
  ;
    %red = call i64 @llvm.vector.reduce.umax.nxv2i64(<vscale x 2 x i64> %v)
@@ -1084,7 +1084,7 @@ declare i64 @llvm.vector.reduce.smax.nxv2i64(<vscale x 2 x i64>)
  
  define i64 @vreduce_smax_nxv2i64(<vscale x 2 x i64> %v) {
  ; CHECK-LABEL: 'vreduce_smax_nxv2i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i64 @llvm.vector.reduce.smax.nxv2i64(<vscale x 2 x i64> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i64 @llvm.vector.reduce.smax.nxv2i64(<vscale x 2 x i64> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %red
  ;
    %red = call i64 @llvm.vector.reduce.smax.nxv2i64(<vscale x 2 x i64> %v)
@@ -1095,7 +1095,7 @@ declare i64 @llvm.vector.reduce.umin.nxv2i64(<vscale x 2 x i64>)
  
  define i64 @vreduce_umin_nxv2i64(<vscale x 2 x i64> %v) {
  ; CHECK-LABEL: 'vreduce_umin_nxv2i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i64 @llvm.vector.reduce.umin.nxv2i64(<vscale x 2 x i64> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i64 @llvm.vector.reduce.umin.nxv2i64(<vscale x 2 x i64> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %red
  ;
    %red = call i64 @llvm.vector.reduce.umin.nxv2i64(<vscale x 2 x i64> %v)
@@ -1106,7 +1106,7 @@ declare i64 @llvm.vector.reduce.smin.nxv2i64(<vscale x 2 x i64>)
  
  define i64 @vreduce_smin_nxv2i64(<vscale x 2 x i64> %v) {
  ; CHECK-LABEL: 'vreduce_smin_nxv2i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i64 @llvm.vector.reduce.smin.nxv2i64(<vscale x 2 x i64> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i64 @llvm.vector.reduce.smin.nxv2i64(<vscale x 2 x i64> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %red
  ;
    %red = call i64 @llvm.vector.reduce.smin.nxv2i64(<vscale x 2 x i64> %v)
@@ -1117,7 +1117,7 @@ declare i64 @llvm.vector.reduce.and.nxv2i64(<vscale x 2 x i64>)
  
  define i64 @vreduce_and_nxv2i64(<vscale x 2 x i64> %v) {
  ; CHECK-LABEL: 'vreduce_and_nxv2i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i64 @llvm.vector.reduce.and.nxv2i64(<vscale x 2 x i64> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i64 @llvm.vector.reduce.and.nxv2i64(<vscale x 2 x i64> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %red
  ;
    %red = call i64 @llvm.vector.reduce.and.nxv2i64(<vscale x 2 x i64> %v)
@@ -1128,7 +1128,7 @@ declare i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64>)
  
  define i64 @vreduce_or_nxv2i64(<vscale x 2 x i64> %v) {
  ; CHECK-LABEL: 'vreduce_or_nxv2i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %red
  ;
    %red = call i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64> %v)
@@ -1139,7 +1139,7 @@ declare i64 @llvm.vector.reduce.xor.nxv2i64(<vscale x 2 x i64>)
  
  define i64 @vreduce_xor_nxv2i64(<vscale x 2 x i64> %v) {
  ; CHECK-LABEL: 'vreduce_xor_nxv2i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %red = call i64 @llvm.vector.reduce.xor.nxv2i64(<vscale x 2 x i64> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i64 @llvm.vector.reduce.xor.nxv2i64(<vscale x 2 x i64> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %red
  ;
    %red = call i64 @llvm.vector.reduce.xor.nxv2i64(<vscale x 2 x i64> %v)
@@ -1150,7 +1150,7 @@ declare i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64>)
  
  define i64 @vreduce_add_nxv4i64(<vscale x 4 x i64> %v) {
  ; CHECK-LABEL: 'vreduce_add_nxv4i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %red
  ;
    %red = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %v)
@@ -1160,7 +1160,7 @@ define i64 @vreduce_add_nxv4i64(<vscale x 4 x i64> %v) {
  define i64 @vwreduce_add_nxv4i32(<vscale x 4 x i32> %v) {
  ; CHECK-LABEL: 'vwreduce_add_nxv4i32'
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = sext <vscale x 4 x i32> %v to <vscale x 4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %e)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %e)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %red
  ;
    %e = sext <vscale x 4 x i32> %v to <vscale x 4 x i64>
@@ -1171,7 +1171,7 @@ define i64 @vwreduce_add_nxv4i32(<vscale x 4 x i32> %v) {
  define i64 @vwreduce_uadd_nxv4i32(<vscale x 4 x i32> %v) {
  ; CHECK-LABEL: 'vwreduce_uadd_nxv4i32'
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = zext <vscale x 4 x i32> %v to <vscale x 4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %e)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %e)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %red
  ;
    %e = zext <vscale x 4 x i32> %v to <vscale x 4 x i64>
@@ -1183,7 +1183,7 @@ declare i64 @llvm.vector.reduce.umax.nxv4i64(<vscale x 4 x i64>)
  
  define i64 @vreduce_umax_nxv4i64(<vscale x 4 x i64> %v) {
  ; CHECK-LABEL: 'vreduce_umax_nxv4i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i64 @llvm.vector.reduce.umax.nxv4i64(<vscale x 4 x i64> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i64 @llvm.vector.reduce.umax.nxv4i64(<vscale x 4 x i64> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %red
  ;
    %red = call i64 @llvm.vector.reduce.umax.nxv4i64(<vscale x 4 x i64> %v)
@@ -1194,7 +1194,7 @@ declare i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64>)
  
  define i64 @vreduce_smax_nxv4i64(<vscale x 4 x i64> %v) {
  ; CHECK-LABEL: 'vreduce_smax_nxv4i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %red
  ;
    %red = call i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64> %v)
@@ -1205,7 +1205,7 @@ declare i64 @llvm.vector.reduce.umin.nxv4i64(<vscale x 4 x i64>)
  
  define i64 @vreduce_umin_nxv4i64(<vscale x 4 x i64> %v) {
  ; CHECK-LABEL: 'vreduce_umin_nxv4i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i64 @llvm.vector.reduce.umin.nxv4i64(<vscale x 4 x i64> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i64 @llvm.vector.reduce.umin.nxv4i64(<vscale x 4 x i64> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %red
  ;
    %red = call i64 @llvm.vector.reduce.umin.nxv4i64(<vscale x 4 x i64> %v)
@@ -1216,7 +1216,7 @@ declare i64 @llvm.vector.reduce.smin.nxv4i64(<vscale x 4 x i64>)
  
  define i64 @vreduce_smin_nxv4i64(<vscale x 4 x i64> %v) {
  ; CHECK-LABEL: 'vreduce_smin_nxv4i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i64 @llvm.vector.reduce.smin.nxv4i64(<vscale x 4 x i64> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i64 @llvm.vector.reduce.smin.nxv4i64(<vscale x 4 x i64> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %red
  ;
    %red = call i64 @llvm.vector.reduce.smin.nxv4i64(<vscale x 4 x i64> %v)
@@ -1227,7 +1227,7 @@ declare i64 @llvm.vector.reduce.and.nxv4i64(<vscale x 4 x i64>)
  
  define i64 @vreduce_and_nxv4i64(<vscale x 4 x i64> %v) {
  ; CHECK-LABEL: 'vreduce_and_nxv4i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i64 @llvm.vector.reduce.and.nxv4i64(<vscale x 4 x i64> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i64 @llvm.vector.reduce.and.nxv4i64(<vscale x 4 x i64> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %red
  ;
    %red = call i64 @llvm.vector.reduce.and.nxv4i64(<vscale x 4 x i64> %v)
@@ -1238,7 +1238,7 @@ declare i64 @llvm.vector.reduce.or.nxv4i64(<vscale x 4 x i64>)
  
  define i64 @vreduce_or_nxv4i64(<vscale x 4 x i64> %v) {
  ; CHECK-LABEL: 'vreduce_or_nxv4i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i64 @llvm.vector.reduce.or.nxv4i64(<vscale x 4 x i64> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i64 @llvm.vector.reduce.or.nxv4i64(<vscale x 4 x i64> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %red
  ;
    %red = call i64 @llvm.vector.reduce.or.nxv4i64(<vscale x 4 x i64> %v)
@@ -1249,7 +1249,7 @@ declare i64 @llvm.vector.reduce.xor.nxv4i64(<vscale x 4 x i64>)
  
  define i64 @vreduce_xor_nxv4i64(<vscale x 4 x i64> %v) {
  ; CHECK-LABEL: 'vreduce_xor_nxv4i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %red = call i64 @llvm.vector.reduce.xor.nxv4i64(<vscale x 4 x i64> %v)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i64 @llvm.vector.reduce.xor.nxv4i64(<vscale x 4 x i64> %v)
  ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %red
  ;
    %red = call i64 @llvm.vector.reduce.xor.nxv4i64(<vscale x 4 x i64> %v)
diff --git a/llvm/test/Analysis/CostModel/RISCV/scalable-gather.ll b/llvm/test/Analysis/CostModel/RISCV/scalable-gather.ll

index b1d9092..d41f881 100644 (file)
--- a/llvm/test/Analysis/CostModel/RISCV/scalable-gather.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/scalable-gather.ll
@@ -5,91 +5,91 @@
  
  define void @masked_gather_aligned() {
  ; GENERIC-LABEL: 'masked_gather_aligned'
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8192 for instruction: %V8F64 = call <vscale x 8 x double> @llvm.masked.gather.nxv8f64.nxv8p0f64(<vscale x 8 x double*> undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x double> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4096 for instruction: %V4F64 = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0f64(<vscale x 4 x double*> undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x double> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: %V2F64 = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0f64(<vscale x 2 x double*> undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x double> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %V1F64 = call <vscale x 1 x double> @llvm.masked.gather.nxv1f64.nxv1p0f64(<vscale x 1 x double*> undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x double> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 16384 for instruction: %V16F32 = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0f32(<vscale x 16 x float*> undef, i32 4, <vscale x 16 x i1> undef, <vscale x 16 x float> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8192 for instruction: %V8F32 = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0f32(<vscale x 8 x float*> undef, i32 4, <vscale x 8 x i1> undef, <vscale x 8 x float> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4096 for instruction: %V4F32 = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> undef, i32 4, <vscale x 4 x i1> undef, <vscale x 4 x float> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: %V2F32 = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0f32(<vscale x 2 x float*> undef, i32 4, <vscale x 2 x i1> undef, <vscale x 2 x float> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %V1F32 = call <vscale x 1 x float> @llvm.masked.gather.nxv1f32.nxv1p0f32(<vscale x 1 x float*> undef, i32 4, <vscale x 1 x i1> undef, <vscale x 1 x float> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 32768 for instruction: %V32F16 = call <vscale x 32 x half> @llvm.masked.gather.nxv32f16.nxv32p0f16(<vscale x 32 x half*> undef, i32 2, <vscale x 32 x i1> undef, <vscale x 32 x half> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 16384 for instruction: %V16F16 = call <vscale x 16 x half> @llvm.masked.gather.nxv16f16.nxv16p0f16(<vscale x 16 x half*> undef, i32 2, <vscale x 16 x i1> undef, <vscale x 16 x half> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8192 for instruction: %V8F16 = call <vscale x 8 x half> @llvm.masked.gather.nxv8f16.nxv8p0f16(<vscale x 8 x half*> undef, i32 2, <vscale x 8 x i1> undef, <vscale x 8 x half> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4096 for instruction: %V4F16 = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16.nxv4p0f16(<vscale x 4 x half*> undef, i32 2, <vscale x 4 x i1> undef, <vscale x 4 x half> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: %V2F16 = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16.nxv2p0f16(<vscale x 2 x half*> undef, i32 2, <vscale x 2 x i1> undef, <vscale x 2 x half> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %V1F16 = call <vscale x 1 x half> @llvm.masked.gather.nxv1f16.nxv1p0f16(<vscale x 1 x half*> undef, i32 2, <vscale x 1 x i1> undef, <vscale x 1 x half> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8192 for instruction: %V8I64 = call <vscale x 8 x i64> @llvm.masked.gather.nxv8i64.nxv8p0i64(<vscale x 8 x i64*> undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i64> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4096 for instruction: %V4I64 = call <vscale x 4 x i64> @llvm.masked.gather.nxv4i64.nxv4p0i64(<vscale x 4 x i64*> undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: %V2I64 = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0i64(<vscale x 2 x i64*> undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i64> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %V1I64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0i64(<vscale x 1 x i64*> undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i64> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 16384 for instruction: %V16I32 = call <vscale x 16 x i32> @llvm.masked.gather.nxv16i32.nxv16p0i32(<vscale x 16 x i32*> undef, i32 4, <vscale x 16 x i1> undef, <vscale x 16 x i32> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8192 for instruction: %V8I32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0i32(<vscale x 8 x i32*> undef, i32 4, <vscale x 8 x i1> undef, <vscale x 8 x i32> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4096 for instruction: %V4I32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> undef, i32 4, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: %V2I32 = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0i32(<vscale x 2 x i32*> undef, i32 4, <vscale x 2 x i1> undef, <vscale x 2 x i32> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %V1I32 = call <vscale x 1 x i32> @llvm.masked.gather.nxv1i32.nxv1p0i32(<vscale x 1 x i32*> undef, i32 4, <vscale x 1 x i1> undef, <vscale x 1 x i32> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 32768 for instruction: %V32I16 = call <vscale x 32 x i16> @llvm.masked.gather.nxv32i16.nxv32p0i16(<vscale x 32 x i16*> undef, i32 2, <vscale x 32 x i1> undef, <vscale x 32 x i16> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 16384 for instruction: %V16I16 = call <vscale x 16 x i16> @llvm.masked.gather.nxv16i16.nxv16p0i16(<vscale x 16 x i16*> undef, i32 2, <vscale x 16 x i1> undef, <vscale x 16 x i16> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8192 for instruction: %V8I16 = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0i16(<vscale x 8 x i16*> undef, i32 2, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4096 for instruction: %V4I16 = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0i16(<vscale x 4 x i16*> undef, i32 2, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: %V2I16 = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16.nxv2p0i16(<vscale x 2 x i16*> undef, i32 2, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %V1I16 = call <vscale x 1 x i16> @llvm.masked.gather.nxv1i16.nxv1p0i16(<vscale x 1 x i16*> undef, i32 2, <vscale x 1 x i1> undef, <vscale x 1 x i16> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 65536 for instruction: %V64I8 = call <vscale x 64 x i8> @llvm.masked.gather.nxv64i8.nxv64p0i8(<vscale x 64 x i8*> undef, i32 1, <vscale x 64 x i1> undef, <vscale x 64 x i8> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 32768 for instruction: %V32I8 = call <vscale x 32 x i8> @llvm.masked.gather.nxv32i8.nxv32p0i8(<vscale x 32 x i8*> undef, i32 1, <vscale x 32 x i1> undef, <vscale x 32 x i8> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 16384 for instruction: %V16I8 = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0i8(<vscale x 16 x i8*> undef, i32 1, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8192 for instruction: %V8I8 = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0i8(<vscale x 8 x i8*> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4096 for instruction: %V4I8 = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8.nxv4p0i8(<vscale x 4 x i8*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: %V2I8 = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0i8(<vscale x 2 x i8*> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %V1I8 = call <vscale x 1 x i8> @llvm.masked.gather.nxv1i8.nxv1p0i8(<vscale x 1 x i8*> undef, i32 1, <vscale x 1 x i1> undef, <vscale x 1 x i8> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8192 for instruction: %V8PTR = call <vscale x 8 x i8*> @llvm.masked.gather.nxv8p0i8.nxv8p0p0i8(<vscale x 8 x i8**> undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8*> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4096 for instruction: %V4PTR = call <vscale x 4 x i8*> @llvm.masked.gather.nxv4p0i8.nxv4p0p0i8(<vscale x 4 x i8**> undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i8*> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: %V2PTR = call <vscale x 2 x i8*> @llvm.masked.gather.nxv2p0i8.nxv2p0p0i8(<vscale x 2 x i8**> undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i8*> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %V1PTR = call <vscale x 1 x i8*> @llvm.masked.gather.nxv1p0i8.nxv1p0p0i8(<vscale x 1 x i8**> undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i8*> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <vscale x 8 x double> @llvm.masked.gather.nxv8f64.nxv8p0f64(<vscale x 8 x double*> undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x double> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0f64(<vscale x 4 x double*> undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x double> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0f64(<vscale x 2 x double*> undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x double> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <vscale x 1 x double> @llvm.masked.gather.nxv1f64.nxv1p0f64(<vscale x 1 x double*> undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x double> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0f32(<vscale x 16 x float*> undef, i32 4, <vscale x 16 x i1> undef, <vscale x 16 x float> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0f32(<vscale x 8 x float*> undef, i32 4, <vscale x 8 x i1> undef, <vscale x 8 x float> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> undef, i32 4, <vscale x 4 x i1> undef, <vscale x 4 x float> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0f32(<vscale x 2 x float*> undef, i32 4, <vscale x 2 x i1> undef, <vscale x 2 x float> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <vscale x 1 x float> @llvm.masked.gather.nxv1f32.nxv1p0f32(<vscale x 1 x float*> undef, i32 4, <vscale x 1 x i1> undef, <vscale x 1 x float> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32F16 = call <vscale x 32 x half> @llvm.masked.gather.nxv32f16.nxv32p0f16(<vscale x 32 x half*> undef, i32 2, <vscale x 32 x i1> undef, <vscale x 32 x half> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F16 = call <vscale x 16 x half> @llvm.masked.gather.nxv16f16.nxv16p0f16(<vscale x 16 x half*> undef, i32 2, <vscale x 16 x i1> undef, <vscale x 16 x half> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F16 = call <vscale x 8 x half> @llvm.masked.gather.nxv8f16.nxv8p0f16(<vscale x 8 x half*> undef, i32 2, <vscale x 8 x i1> undef, <vscale x 8 x half> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F16 = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16.nxv4p0f16(<vscale x 4 x half*> undef, i32 2, <vscale x 4 x i1> undef, <vscale x 4 x half> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F16 = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16.nxv2p0f16(<vscale x 2 x half*> undef, i32 2, <vscale x 2 x i1> undef, <vscale x 2 x half> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = call <vscale x 1 x half> @llvm.masked.gather.nxv1f16.nxv1p0f16(<vscale x 1 x half*> undef, i32 2, <vscale x 1 x i1> undef, <vscale x 1 x half> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <vscale x 8 x i64> @llvm.masked.gather.nxv8i64.nxv8p0i64(<vscale x 8 x i64*> undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i64> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <vscale x 4 x i64> @llvm.masked.gather.nxv4i64.nxv4p0i64(<vscale x 4 x i64*> undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0i64(<vscale x 2 x i64*> undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i64> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0i64(<vscale x 1 x i64*> undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i64> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <vscale x 16 x i32> @llvm.masked.gather.nxv16i32.nxv16p0i32(<vscale x 16 x i32*> undef, i32 4, <vscale x 16 x i1> undef, <vscale x 16 x i32> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0i32(<vscale x 8 x i32*> undef, i32 4, <vscale x 8 x i1> undef, <vscale x 8 x i32> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> undef, i32 4, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0i32(<vscale x 2 x i32*> undef, i32 4, <vscale x 2 x i1> undef, <vscale x 2 x i32> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = call <vscale x 1 x i32> @llvm.masked.gather.nxv1i32.nxv1p0i32(<vscale x 1 x i32*> undef, i32 4, <vscale x 1 x i1> undef, <vscale x 1 x i32> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <vscale x 32 x i16> @llvm.masked.gather.nxv32i16.nxv32p0i16(<vscale x 32 x i16*> undef, i32 2, <vscale x 32 x i1> undef, <vscale x 32 x i16> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <vscale x 16 x i16> @llvm.masked.gather.nxv16i16.nxv16p0i16(<vscale x 16 x i16*> undef, i32 2, <vscale x 16 x i1> undef, <vscale x 16 x i16> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0i16(<vscale x 8 x i16*> undef, i32 2, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0i16(<vscale x 4 x i16*> undef, i32 2, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16.nxv2p0i16(<vscale x 2 x i16*> undef, i32 2, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I16 = call <vscale x 1 x i16> @llvm.masked.gather.nxv1i16.nxv1p0i16(<vscale x 1 x i16*> undef, i32 2, <vscale x 1 x i1> undef, <vscale x 1 x i16> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <vscale x 64 x i8> @llvm.masked.gather.nxv64i8.nxv64p0i8(<vscale x 64 x i8*> undef, i32 1, <vscale x 64 x i1> undef, <vscale x 64 x i8> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <vscale x 32 x i8> @llvm.masked.gather.nxv32i8.nxv32p0i8(<vscale x 32 x i8*> undef, i32 1, <vscale x 32 x i1> undef, <vscale x 32 x i8> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0i8(<vscale x 16 x i8*> undef, i32 1, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0i8(<vscale x 8 x i8*> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8.nxv4p0i8(<vscale x 4 x i8*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0i8(<vscale x 2 x i8*> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I8 = call <vscale x 1 x i8> @llvm.masked.gather.nxv1i8.nxv1p0i8(<vscale x 1 x i8*> undef, i32 1, <vscale x 1 x i1> undef, <vscale x 1 x i8> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8PTR = call <vscale x 8 x i8*> @llvm.masked.gather.nxv8p0i8.nxv8p0p0i8(<vscale x 8 x i8**> undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8*> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4PTR = call <vscale x 4 x i8*> @llvm.masked.gather.nxv4p0i8.nxv4p0p0i8(<vscale x 4 x i8**> undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i8*> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2PTR = call <vscale x 2 x i8*> @llvm.masked.gather.nxv2p0i8.nxv2p0p0i8(<vscale x 2 x i8**> undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i8*> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1PTR = call <vscale x 1 x i8*> @llvm.masked.gather.nxv1p0i8.nxv1p0p0i8(<vscale x 1 x i8**> undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i8*> undef)
  ; GENERIC-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
  ;
  ; MAX256-LABEL: 'masked_gather_aligned'
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8F64 = call <vscale x 8 x double> @llvm.masked.gather.nxv8f64.nxv8p0f64(<vscale x 8 x double*> undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x double> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4F64 = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0f64(<vscale x 4 x double*> undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x double> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0f64(<vscale x 2 x double*> undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x double> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1F64 = call <vscale x 1 x double> @llvm.masked.gather.nxv1f64.nxv1p0f64(<vscale x 1 x double*> undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x double> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16F32 = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0f32(<vscale x 16 x float*> undef, i32 4, <vscale x 16 x i1> undef, <vscale x 16 x float> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8F32 = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0f32(<vscale x 8 x float*> undef, i32 4, <vscale x 8 x i1> undef, <vscale x 8 x float> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4F32 = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> undef, i32 4, <vscale x 4 x i1> undef, <vscale x 4 x float> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0f32(<vscale x 2 x float*> undef, i32 4, <vscale x 2 x i1> undef, <vscale x 2 x float> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1F32 = call <vscale x 1 x float> @llvm.masked.gather.nxv1f32.nxv1p0f32(<vscale x 1 x float*> undef, i32 4, <vscale x 1 x i1> undef, <vscale x 1 x float> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32F16 = call <vscale x 32 x half> @llvm.masked.gather.nxv32f16.nxv32p0f16(<vscale x 32 x half*> undef, i32 2, <vscale x 32 x i1> undef, <vscale x 32 x half> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16F16 = call <vscale x 16 x half> @llvm.masked.gather.nxv16f16.nxv16p0f16(<vscale x 16 x half*> undef, i32 2, <vscale x 16 x i1> undef, <vscale x 16 x half> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8F16 = call <vscale x 8 x half> @llvm.masked.gather.nxv8f16.nxv8p0f16(<vscale x 8 x half*> undef, i32 2, <vscale x 8 x i1> undef, <vscale x 8 x half> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4F16 = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16.nxv4p0f16(<vscale x 4 x half*> undef, i32 2, <vscale x 4 x i1> undef, <vscale x 4 x half> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F16 = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16.nxv2p0f16(<vscale x 2 x half*> undef, i32 2, <vscale x 2 x i1> undef, <vscale x 2 x half> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1F16 = call <vscale x 1 x half> @llvm.masked.gather.nxv1f16.nxv1p0f16(<vscale x 1 x half*> undef, i32 2, <vscale x 1 x i1> undef, <vscale x 1 x half> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <vscale x 8 x i64> @llvm.masked.gather.nxv8i64.nxv8p0i64(<vscale x 8 x i64*> undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i64> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <vscale x 4 x i64> @llvm.masked.gather.nxv4i64.nxv4p0i64(<vscale x 4 x i64*> undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0i64(<vscale x 2 x i64*> undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i64> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1I64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0i64(<vscale x 1 x i64*> undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i64> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I32 = call <vscale x 16 x i32> @llvm.masked.gather.nxv16i32.nxv16p0i32(<vscale x 16 x i32*> undef, i32 4, <vscale x 16 x i1> undef, <vscale x 16 x i32> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0i32(<vscale x 8 x i32*> undef, i32 4, <vscale x 8 x i1> undef, <vscale x 8 x i32> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> undef, i32 4, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0i32(<vscale x 2 x i32*> undef, i32 4, <vscale x 2 x i1> undef, <vscale x 2 x i32> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1I32 = call <vscale x 1 x i32> @llvm.masked.gather.nxv1i32.nxv1p0i32(<vscale x 1 x i32*> undef, i32 4, <vscale x 1 x i1> undef, <vscale x 1 x i32> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <vscale x 32 x i16> @llvm.masked.gather.nxv32i16.nxv32p0i16(<vscale x 32 x i16*> undef, i32 2, <vscale x 32 x i1> undef, <vscale x 32 x i16> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <vscale x 16 x i16> @llvm.masked.gather.nxv16i16.nxv16p0i16(<vscale x 16 x i16*> undef, i32 2, <vscale x 16 x i1> undef, <vscale x 16 x i16> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0i16(<vscale x 8 x i16*> undef, i32 2, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0i16(<vscale x 4 x i16*> undef, i32 2, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16.nxv2p0i16(<vscale x 2 x i16*> undef, i32 2, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1I16 = call <vscale x 1 x i16> @llvm.masked.gather.nxv1i16.nxv1p0i16(<vscale x 1 x i16*> undef, i32 2, <vscale x 1 x i1> undef, <vscale x 1 x i16> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <vscale x 64 x i8> @llvm.masked.gather.nxv64i8.nxv64p0i8(<vscale x 64 x i8*> undef, i32 1, <vscale x 64 x i1> undef, <vscale x 64 x i8> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <vscale x 32 x i8> @llvm.masked.gather.nxv32i8.nxv32p0i8(<vscale x 32 x i8*> undef, i32 1, <vscale x 32 x i1> undef, <vscale x 32 x i8> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0i8(<vscale x 16 x i8*> undef, i32 1, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0i8(<vscale x 8 x i8*> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8.nxv4p0i8(<vscale x 4 x i8*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0i8(<vscale x 2 x i8*> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1I8 = call <vscale x 1 x i8> @llvm.masked.gather.nxv1i8.nxv1p0i8(<vscale x 1 x i8*> undef, i32 1, <vscale x 1 x i1> undef, <vscale x 1 x i8> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8PTR = call <vscale x 8 x i8*> @llvm.masked.gather.nxv8p0i8.nxv8p0p0i8(<vscale x 8 x i8**> undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8*> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4PTR = call <vscale x 4 x i8*> @llvm.masked.gather.nxv4p0i8.nxv4p0p0i8(<vscale x 4 x i8**> undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i8*> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2PTR = call <vscale x 2 x i8*> @llvm.masked.gather.nxv2p0i8.nxv2p0p0i8(<vscale x 2 x i8**> undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i8*> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1PTR = call <vscale x 1 x i8*> @llvm.masked.gather.nxv1p0i8.nxv1p0p0i8(<vscale x 1 x i8**> undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i8*> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <vscale x 8 x double> @llvm.masked.gather.nxv8f64.nxv8p0f64(<vscale x 8 x double*> undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x double> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0f64(<vscale x 4 x double*> undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x double> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0f64(<vscale x 2 x double*> undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x double> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <vscale x 1 x double> @llvm.masked.gather.nxv1f64.nxv1p0f64(<vscale x 1 x double*> undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x double> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0f32(<vscale x 16 x float*> undef, i32 4, <vscale x 16 x i1> undef, <vscale x 16 x float> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0f32(<vscale x 8 x float*> undef, i32 4, <vscale x 8 x i1> undef, <vscale x 8 x float> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> undef, i32 4, <vscale x 4 x i1> undef, <vscale x 4 x float> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0f32(<vscale x 2 x float*> undef, i32 4, <vscale x 2 x i1> undef, <vscale x 2 x float> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <vscale x 1 x float> @llvm.masked.gather.nxv1f32.nxv1p0f32(<vscale x 1 x float*> undef, i32 4, <vscale x 1 x i1> undef, <vscale x 1 x float> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32F16 = call <vscale x 32 x half> @llvm.masked.gather.nxv32f16.nxv32p0f16(<vscale x 32 x half*> undef, i32 2, <vscale x 32 x i1> undef, <vscale x 32 x half> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F16 = call <vscale x 16 x half> @llvm.masked.gather.nxv16f16.nxv16p0f16(<vscale x 16 x half*> undef, i32 2, <vscale x 16 x i1> undef, <vscale x 16 x half> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F16 = call <vscale x 8 x half> @llvm.masked.gather.nxv8f16.nxv8p0f16(<vscale x 8 x half*> undef, i32 2, <vscale x 8 x i1> undef, <vscale x 8 x half> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F16 = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16.nxv4p0f16(<vscale x 4 x half*> undef, i32 2, <vscale x 4 x i1> undef, <vscale x 4 x half> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F16 = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16.nxv2p0f16(<vscale x 2 x half*> undef, i32 2, <vscale x 2 x i1> undef, <vscale x 2 x half> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = call <vscale x 1 x half> @llvm.masked.gather.nxv1f16.nxv1p0f16(<vscale x 1 x half*> undef, i32 2, <vscale x 1 x i1> undef, <vscale x 1 x half> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <vscale x 8 x i64> @llvm.masked.gather.nxv8i64.nxv8p0i64(<vscale x 8 x i64*> undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i64> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <vscale x 4 x i64> @llvm.masked.gather.nxv4i64.nxv4p0i64(<vscale x 4 x i64*> undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0i64(<vscale x 2 x i64*> undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i64> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0i64(<vscale x 1 x i64*> undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i64> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <vscale x 16 x i32> @llvm.masked.gather.nxv16i32.nxv16p0i32(<vscale x 16 x i32*> undef, i32 4, <vscale x 16 x i1> undef, <vscale x 16 x i32> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0i32(<vscale x 8 x i32*> undef, i32 4, <vscale x 8 x i1> undef, <vscale x 8 x i32> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> undef, i32 4, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0i32(<vscale x 2 x i32*> undef, i32 4, <vscale x 2 x i1> undef, <vscale x 2 x i32> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = call <vscale x 1 x i32> @llvm.masked.gather.nxv1i32.nxv1p0i32(<vscale x 1 x i32*> undef, i32 4, <vscale x 1 x i1> undef, <vscale x 1 x i32> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <vscale x 32 x i16> @llvm.masked.gather.nxv32i16.nxv32p0i16(<vscale x 32 x i16*> undef, i32 2, <vscale x 32 x i1> undef, <vscale x 32 x i16> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <vscale x 16 x i16> @llvm.masked.gather.nxv16i16.nxv16p0i16(<vscale x 16 x i16*> undef, i32 2, <vscale x 16 x i1> undef, <vscale x 16 x i16> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0i16(<vscale x 8 x i16*> undef, i32 2, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0i16(<vscale x 4 x i16*> undef, i32 2, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16.nxv2p0i16(<vscale x 2 x i16*> undef, i32 2, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I16 = call <vscale x 1 x i16> @llvm.masked.gather.nxv1i16.nxv1p0i16(<vscale x 1 x i16*> undef, i32 2, <vscale x 1 x i1> undef, <vscale x 1 x i16> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <vscale x 64 x i8> @llvm.masked.gather.nxv64i8.nxv64p0i8(<vscale x 64 x i8*> undef, i32 1, <vscale x 64 x i1> undef, <vscale x 64 x i8> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <vscale x 32 x i8> @llvm.masked.gather.nxv32i8.nxv32p0i8(<vscale x 32 x i8*> undef, i32 1, <vscale x 32 x i1> undef, <vscale x 32 x i8> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0i8(<vscale x 16 x i8*> undef, i32 1, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0i8(<vscale x 8 x i8*> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8.nxv4p0i8(<vscale x 4 x i8*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0i8(<vscale x 2 x i8*> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I8 = call <vscale x 1 x i8> @llvm.masked.gather.nxv1i8.nxv1p0i8(<vscale x 1 x i8*> undef, i32 1, <vscale x 1 x i1> undef, <vscale x 1 x i8> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8PTR = call <vscale x 8 x i8*> @llvm.masked.gather.nxv8p0i8.nxv8p0p0i8(<vscale x 8 x i8**> undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8*> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4PTR = call <vscale x 4 x i8*> @llvm.masked.gather.nxv4p0i8.nxv4p0p0i8(<vscale x 4 x i8**> undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i8*> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2PTR = call <vscale x 2 x i8*> @llvm.masked.gather.nxv2p0i8.nxv2p0p0i8(<vscale x 2 x i8**> undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i8*> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1PTR = call <vscale x 1 x i8*> @llvm.masked.gather.nxv1p0i8.nxv1p0p0i8(<vscale x 1 x i8**> undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i8*> undef)
  ; MAX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
  ;
  ; UNSUPPORTED-LABEL: 'masked_gather_aligned'
diff --git a/llvm/test/Analysis/CostModel/RISCV/scalable-scatter.ll b/llvm/test/Analysis/CostModel/RISCV/scalable-scatter.ll

index 7991aad..c993e20 100644 (file)
--- a/llvm/test/Analysis/CostModel/RISCV/scalable-scatter.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/scalable-scatter.ll
@@ -5,91 +5,91 @@
  
  define void @masked_scatter_aligned() {
  ; GENERIC-LABEL: 'masked_scatter_aligned'
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8192 for instruction: call void @llvm.masked.scatter.nxv8f64.nxv8p0f64(<vscale x 8 x double> undef, <vscale x 8 x double*> undef, i32 8, <vscale x 8 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4096 for instruction: call void @llvm.masked.scatter.nxv4f64.nxv4p0f64(<vscale x 4 x double> undef, <vscale x 4 x double*> undef, i32 8, <vscale x 4 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64(<vscale x 2 x double> undef, <vscale x 2 x double*> undef, i32 8, <vscale x 2 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: call void @llvm.masked.scatter.nxv1f64.nxv1p0f64(<vscale x 1 x double> undef, <vscale x 1 x double*> undef, i32 8, <vscale x 1 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 16384 for instruction: call void @llvm.masked.scatter.nxv16f32.nxv16p0f32(<vscale x 16 x float> undef, <vscale x 16 x float*> undef, i32 4, <vscale x 16 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8192 for instruction: call void @llvm.masked.scatter.nxv8f32.nxv8p0f32(<vscale x 8 x float> undef, <vscale x 8 x float*> undef, i32 4, <vscale x 8 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4096 for instruction: call void @llvm.masked.scatter.nxv4f32.nxv4p0f32(<vscale x 4 x float> undef, <vscale x 4 x float*> undef, i32 4, <vscale x 4 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: call void @llvm.masked.scatter.nxv2f32.nxv2p0f32(<vscale x 2 x float> undef, <vscale x 2 x float*> undef, i32 4, <vscale x 2 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: call void @llvm.masked.scatter.nxv1f32.nxv1p0f32(<vscale x 1 x float> undef, <vscale x 1 x float*> undef, i32 4, <vscale x 1 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 32768 for instruction: call void @llvm.masked.scatter.nxv32f16.nxv32p0f16(<vscale x 32 x half> undef, <vscale x 32 x half*> undef, i32 2, <vscale x 32 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 16384 for instruction: call void @llvm.masked.scatter.nxv16f16.nxv16p0f16(<vscale x 16 x half> undef, <vscale x 16 x half*> undef, i32 2, <vscale x 16 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8192 for instruction: call void @llvm.masked.scatter.nxv8f16.nxv8p0f16(<vscale x 8 x half> undef, <vscale x 8 x half*> undef, i32 2, <vscale x 8 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4096 for instruction: call void @llvm.masked.scatter.nxv4f16.nxv4p0f16(<vscale x 4 x half> undef, <vscale x 4 x half*> undef, i32 2, <vscale x 4 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: call void @llvm.masked.scatter.nxv2f16.nxv2p0f16(<vscale x 2 x half> undef, <vscale x 2 x half*> undef, i32 2, <vscale x 2 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: call void @llvm.masked.scatter.nxv1f16.nxv1p0f16(<vscale x 1 x half> undef, <vscale x 1 x half*> undef, i32 2, <vscale x 1 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8192 for instruction: call void @llvm.masked.scatter.nxv8i64.nxv8p0i64(<vscale x 8 x i64> undef, <vscale x 8 x i64*> undef, i32 8, <vscale x 8 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4096 for instruction: call void @llvm.masked.scatter.nxv4i64.nxv4p0i64(<vscale x 4 x i64> undef, <vscale x 4 x i64*> undef, i32 8, <vscale x 4 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: call void @llvm.masked.scatter.nxv2i64.nxv2p0i64(<vscale x 2 x i64> undef, <vscale x 2 x i64*> undef, i32 8, <vscale x 2 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0i64(<vscale x 1 x i64> undef, <vscale x 1 x i64*> undef, i32 8, <vscale x 1 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 16384 for instruction: call void @llvm.masked.scatter.nxv16i32.nxv16p0i32(<vscale x 16 x i32> undef, <vscale x 16 x i32*> undef, i32 4, <vscale x 16 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8192 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0i32(<vscale x 8 x i32> undef, <vscale x 8 x i32*> undef, i32 4, <vscale x 8 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4096 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> undef, <vscale x 4 x i32*> undef, i32 4, <vscale x 4 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: call void @llvm.masked.scatter.nxv2i32.nxv2p0i32(<vscale x 2 x i32> undef, <vscale x 2 x i32*> undef, i32 4, <vscale x 2 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: call void @llvm.masked.scatter.nxv1i32.nxv1p0i32(<vscale x 1 x i32> undef, <vscale x 1 x i32*> undef, i32 4, <vscale x 1 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 32768 for instruction: call void @llvm.masked.scatter.nxv32i16.nxv32p0i16(<vscale x 32 x i16> undef, <vscale x 32 x i16*> undef, i32 2, <vscale x 32 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 16384 for instruction: call void @llvm.masked.scatter.nxv16i16.nxv16p0i16(<vscale x 16 x i16> undef, <vscale x 16 x i16*> undef, i32 2, <vscale x 16 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8192 for instruction: call void @llvm.masked.scatter.nxv8i16.nxv8p0i16(<vscale x 8 x i16> undef, <vscale x 8 x i16*> undef, i32 2, <vscale x 8 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4096 for instruction: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16> undef, <vscale x 4 x i16*> undef, i32 2, <vscale x 4 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: call void @llvm.masked.scatter.nxv2i16.nxv2p0i16(<vscale x 2 x i16> undef, <vscale x 2 x i16*> undef, i32 2, <vscale x 2 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: call void @llvm.masked.scatter.nxv1i16.nxv1p0i16(<vscale x 1 x i16> undef, <vscale x 1 x i16*> undef, i32 2, <vscale x 1 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 65536 for instruction: call void @llvm.masked.scatter.nxv64i8.nxv64p0i8(<vscale x 64 x i8> undef, <vscale x 64 x i8*> undef, i32 1, <vscale x 64 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 32768 for instruction: call void @llvm.masked.scatter.nxv32i8.nxv32p0i8(<vscale x 32 x i8> undef, <vscale x 32 x i8*> undef, i32 1, <vscale x 32 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 16384 for instruction: call void @llvm.masked.scatter.nxv16i8.nxv16p0i8(<vscale x 16 x i8> undef, <vscale x 16 x i8*> undef, i32 1, <vscale x 16 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8192 for instruction: call void @llvm.masked.scatter.nxv8i8.nxv8p0i8(<vscale x 8 x i8> undef, <vscale x 8 x i8*> undef, i32 1, <vscale x 8 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4096 for instruction: call void @llvm.masked.scatter.nxv4i8.nxv4p0i8(<vscale x 4 x i8> undef, <vscale x 4 x i8*> undef, i32 1, <vscale x 4 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: call void @llvm.masked.scatter.nxv2i8.nxv2p0i8(<vscale x 2 x i8> undef, <vscale x 2 x i8*> undef, i32 1, <vscale x 2 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: call void @llvm.masked.scatter.nxv1i8.nxv1p0i8(<vscale x 1 x i8> undef, <vscale x 1 x i8*> undef, i32 1, <vscale x 1 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8192 for instruction: call void @llvm.masked.scatter.nxv8p0i8.nxv8p0p0i8(<vscale x 8 x i8*> undef, <vscale x 8 x i8**> undef, i32 8, <vscale x 8 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4096 for instruction: call void @llvm.masked.scatter.nxv4p0i8.nxv4p0p0i8(<vscale x 4 x i8*> undef, <vscale x 4 x i8**> undef, i32 8, <vscale x 4 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: call void @llvm.masked.scatter.nxv2p0i8.nxv2p0p0i8(<vscale x 2 x i8*> undef, <vscale x 2 x i8**> undef, i32 8, <vscale x 2 x i1> undef)
-; GENERIC-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: call void @llvm.masked.scatter.nxv1p0i8.nxv1p0p0i8(<vscale x 1 x i8*> undef, <vscale x 1 x i8**> undef, i32 8, <vscale x 1 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8f64.nxv8p0f64(<vscale x 8 x double> undef, <vscale x 8 x double*> undef, i32 8, <vscale x 8 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4f64.nxv4p0f64(<vscale x 4 x double> undef, <vscale x 4 x double*> undef, i32 8, <vscale x 4 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64(<vscale x 2 x double> undef, <vscale x 2 x double*> undef, i32 8, <vscale x 2 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1f64.nxv1p0f64(<vscale x 1 x double> undef, <vscale x 1 x double*> undef, i32 8, <vscale x 1 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv16f32.nxv16p0f32(<vscale x 16 x float> undef, <vscale x 16 x float*> undef, i32 4, <vscale x 16 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8f32.nxv8p0f32(<vscale x 8 x float> undef, <vscale x 8 x float*> undef, i32 4, <vscale x 8 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4f32.nxv4p0f32(<vscale x 4 x float> undef, <vscale x 4 x float*> undef, i32 4, <vscale x 4 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2f32.nxv2p0f32(<vscale x 2 x float> undef, <vscale x 2 x float*> undef, i32 4, <vscale x 2 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1f32.nxv1p0f32(<vscale x 1 x float> undef, <vscale x 1 x float*> undef, i32 4, <vscale x 1 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv32f16.nxv32p0f16(<vscale x 32 x half> undef, <vscale x 32 x half*> undef, i32 2, <vscale x 32 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv16f16.nxv16p0f16(<vscale x 16 x half> undef, <vscale x 16 x half*> undef, i32 2, <vscale x 16 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8f16.nxv8p0f16(<vscale x 8 x half> undef, <vscale x 8 x half*> undef, i32 2, <vscale x 8 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4f16.nxv4p0f16(<vscale x 4 x half> undef, <vscale x 4 x half*> undef, i32 2, <vscale x 4 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2f16.nxv2p0f16(<vscale x 2 x half> undef, <vscale x 2 x half*> undef, i32 2, <vscale x 2 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1f16.nxv1p0f16(<vscale x 1 x half> undef, <vscale x 1 x half*> undef, i32 2, <vscale x 1 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i64.nxv8p0i64(<vscale x 8 x i64> undef, <vscale x 8 x i64*> undef, i32 8, <vscale x 8 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i64.nxv4p0i64(<vscale x 4 x i64> undef, <vscale x 4 x i64*> undef, i32 8, <vscale x 4 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2i64.nxv2p0i64(<vscale x 2 x i64> undef, <vscale x 2 x i64*> undef, i32 8, <vscale x 2 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0i64(<vscale x 1 x i64> undef, <vscale x 1 x i64*> undef, i32 8, <vscale x 1 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv16i32.nxv16p0i32(<vscale x 16 x i32> undef, <vscale x 16 x i32*> undef, i32 4, <vscale x 16 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0i32(<vscale x 8 x i32> undef, <vscale x 8 x i32*> undef, i32 4, <vscale x 8 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> undef, <vscale x 4 x i32*> undef, i32 4, <vscale x 4 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2i32.nxv2p0i32(<vscale x 2 x i32> undef, <vscale x 2 x i32*> undef, i32 4, <vscale x 2 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1i32.nxv1p0i32(<vscale x 1 x i32> undef, <vscale x 1 x i32*> undef, i32 4, <vscale x 1 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv32i16.nxv32p0i16(<vscale x 32 x i16> undef, <vscale x 32 x i16*> undef, i32 2, <vscale x 32 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv16i16.nxv16p0i16(<vscale x 16 x i16> undef, <vscale x 16 x i16*> undef, i32 2, <vscale x 16 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i16.nxv8p0i16(<vscale x 8 x i16> undef, <vscale x 8 x i16*> undef, i32 2, <vscale x 8 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16> undef, <vscale x 4 x i16*> undef, i32 2, <vscale x 4 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2i16.nxv2p0i16(<vscale x 2 x i16> undef, <vscale x 2 x i16*> undef, i32 2, <vscale x 2 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1i16.nxv1p0i16(<vscale x 1 x i16> undef, <vscale x 1 x i16*> undef, i32 2, <vscale x 1 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.nxv64i8.nxv64p0i8(<vscale x 64 x i8> undef, <vscale x 64 x i8*> undef, i32 1, <vscale x 64 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv32i8.nxv32p0i8(<vscale x 32 x i8> undef, <vscale x 32 x i8*> undef, i32 1, <vscale x 32 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv16i8.nxv16p0i8(<vscale x 16 x i8> undef, <vscale x 16 x i8*> undef, i32 1, <vscale x 16 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i8.nxv8p0i8(<vscale x 8 x i8> undef, <vscale x 8 x i8*> undef, i32 1, <vscale x 8 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i8.nxv4p0i8(<vscale x 4 x i8> undef, <vscale x 4 x i8*> undef, i32 1, <vscale x 4 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2i8.nxv2p0i8(<vscale x 2 x i8> undef, <vscale x 2 x i8*> undef, i32 1, <vscale x 2 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1i8.nxv1p0i8(<vscale x 1 x i8> undef, <vscale x 1 x i8*> undef, i32 1, <vscale x 1 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8p0i8.nxv8p0p0i8(<vscale x 8 x i8*> undef, <vscale x 8 x i8**> undef, i32 8, <vscale x 8 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4p0i8.nxv4p0p0i8(<vscale x 4 x i8*> undef, <vscale x 4 x i8**> undef, i32 8, <vscale x 4 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2p0i8.nxv2p0p0i8(<vscale x 2 x i8*> undef, <vscale x 2 x i8**> undef, i32 8, <vscale x 2 x i1> undef)
+; GENERIC-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1p0i8.nxv1p0p0i8(<vscale x 1 x i8*> undef, <vscale x 1 x i8**> undef, i32 8, <vscale x 1 x i1> undef)
  ; GENERIC-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
  ;
  ; MAX256-LABEL: 'masked_scatter_aligned'
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv8f64.nxv8p0f64(<vscale x 8 x double> undef, <vscale x 8 x double*> undef, i32 8, <vscale x 8 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv4f64.nxv4p0f64(<vscale x 4 x double> undef, <vscale x 4 x double*> undef, i32 8, <vscale x 4 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64(<vscale x 2 x double> undef, <vscale x 2 x double*> undef, i32 8, <vscale x 2 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv1f64.nxv1p0f64(<vscale x 1 x double> undef, <vscale x 1 x double*> undef, i32 8, <vscale x 1 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv16f32.nxv16p0f32(<vscale x 16 x float> undef, <vscale x 16 x float*> undef, i32 4, <vscale x 16 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv8f32.nxv8p0f32(<vscale x 8 x float> undef, <vscale x 8 x float*> undef, i32 4, <vscale x 8 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv4f32.nxv4p0f32(<vscale x 4 x float> undef, <vscale x 4 x float*> undef, i32 4, <vscale x 4 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv2f32.nxv2p0f32(<vscale x 2 x float> undef, <vscale x 2 x float*> undef, i32 4, <vscale x 2 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv1f32.nxv1p0f32(<vscale x 1 x float> undef, <vscale x 1 x float*> undef, i32 4, <vscale x 1 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.nxv32f16.nxv32p0f16(<vscale x 32 x half> undef, <vscale x 32 x half*> undef, i32 2, <vscale x 32 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv16f16.nxv16p0f16(<vscale x 16 x half> undef, <vscale x 16 x half*> undef, i32 2, <vscale x 16 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv8f16.nxv8p0f16(<vscale x 8 x half> undef, <vscale x 8 x half*> undef, i32 2, <vscale x 8 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv4f16.nxv4p0f16(<vscale x 4 x half> undef, <vscale x 4 x half*> undef, i32 2, <vscale x 4 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv2f16.nxv2p0f16(<vscale x 2 x half> undef, <vscale x 2 x half*> undef, i32 2, <vscale x 2 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv1f16.nxv1p0f16(<vscale x 1 x half> undef, <vscale x 1 x half*> undef, i32 2, <vscale x 1 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv8i64.nxv8p0i64(<vscale x 8 x i64> undef, <vscale x 8 x i64*> undef, i32 8, <vscale x 8 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv4i64.nxv4p0i64(<vscale x 4 x i64> undef, <vscale x 4 x i64*> undef, i32 8, <vscale x 4 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv2i64.nxv2p0i64(<vscale x 2 x i64> undef, <vscale x 2 x i64*> undef, i32 8, <vscale x 2 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0i64(<vscale x 1 x i64> undef, <vscale x 1 x i64*> undef, i32 8, <vscale x 1 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv16i32.nxv16p0i32(<vscale x 16 x i32> undef, <vscale x 16 x i32*> undef, i32 4, <vscale x 16 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0i32(<vscale x 8 x i32> undef, <vscale x 8 x i32*> undef, i32 4, <vscale x 8 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> undef, <vscale x 4 x i32*> undef, i32 4, <vscale x 4 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv2i32.nxv2p0i32(<vscale x 2 x i32> undef, <vscale x 2 x i32*> undef, i32 4, <vscale x 2 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv1i32.nxv1p0i32(<vscale x 1 x i32> undef, <vscale x 1 x i32*> undef, i32 4, <vscale x 1 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.nxv32i16.nxv32p0i16(<vscale x 32 x i16> undef, <vscale x 32 x i16*> undef, i32 2, <vscale x 32 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv16i16.nxv16p0i16(<vscale x 16 x i16> undef, <vscale x 16 x i16*> undef, i32 2, <vscale x 16 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv8i16.nxv8p0i16(<vscale x 8 x i16> undef, <vscale x 8 x i16*> undef, i32 2, <vscale x 8 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16> undef, <vscale x 4 x i16*> undef, i32 2, <vscale x 4 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv2i16.nxv2p0i16(<vscale x 2 x i16> undef, <vscale x 2 x i16*> undef, i32 2, <vscale x 2 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv1i16.nxv1p0i16(<vscale x 1 x i16> undef, <vscale x 1 x i16*> undef, i32 2, <vscale x 1 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: call void @llvm.masked.scatter.nxv64i8.nxv64p0i8(<vscale x 64 x i8> undef, <vscale x 64 x i8*> undef, i32 1, <vscale x 64 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.nxv32i8.nxv32p0i8(<vscale x 32 x i8> undef, <vscale x 32 x i8*> undef, i32 1, <vscale x 32 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv16i8.nxv16p0i8(<vscale x 16 x i8> undef, <vscale x 16 x i8*> undef, i32 1, <vscale x 16 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv8i8.nxv8p0i8(<vscale x 8 x i8> undef, <vscale x 8 x i8*> undef, i32 1, <vscale x 8 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv4i8.nxv4p0i8(<vscale x 4 x i8> undef, <vscale x 4 x i8*> undef, i32 1, <vscale x 4 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv2i8.nxv2p0i8(<vscale x 2 x i8> undef, <vscale x 2 x i8*> undef, i32 1, <vscale x 2 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv1i8.nxv1p0i8(<vscale x 1 x i8> undef, <vscale x 1 x i8*> undef, i32 1, <vscale x 1 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv8p0i8.nxv8p0p0i8(<vscale x 8 x i8*> undef, <vscale x 8 x i8**> undef, i32 8, <vscale x 8 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv4p0i8.nxv4p0p0i8(<vscale x 4 x i8*> undef, <vscale x 4 x i8**> undef, i32 8, <vscale x 4 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv2p0i8.nxv2p0p0i8(<vscale x 2 x i8*> undef, <vscale x 2 x i8**> undef, i32 8, <vscale x 2 x i1> undef)
-; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv1p0i8.nxv1p0p0i8(<vscale x 1 x i8*> undef, <vscale x 1 x i8**> undef, i32 8, <vscale x 1 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8f64.nxv8p0f64(<vscale x 8 x double> undef, <vscale x 8 x double*> undef, i32 8, <vscale x 8 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4f64.nxv4p0f64(<vscale x 4 x double> undef, <vscale x 4 x double*> undef, i32 8, <vscale x 4 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64(<vscale x 2 x double> undef, <vscale x 2 x double*> undef, i32 8, <vscale x 2 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1f64.nxv1p0f64(<vscale x 1 x double> undef, <vscale x 1 x double*> undef, i32 8, <vscale x 1 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv16f32.nxv16p0f32(<vscale x 16 x float> undef, <vscale x 16 x float*> undef, i32 4, <vscale x 16 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8f32.nxv8p0f32(<vscale x 8 x float> undef, <vscale x 8 x float*> undef, i32 4, <vscale x 8 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4f32.nxv4p0f32(<vscale x 4 x float> undef, <vscale x 4 x float*> undef, i32 4, <vscale x 4 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2f32.nxv2p0f32(<vscale x 2 x float> undef, <vscale x 2 x float*> undef, i32 4, <vscale x 2 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1f32.nxv1p0f32(<vscale x 1 x float> undef, <vscale x 1 x float*> undef, i32 4, <vscale x 1 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv32f16.nxv32p0f16(<vscale x 32 x half> undef, <vscale x 32 x half*> undef, i32 2, <vscale x 32 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv16f16.nxv16p0f16(<vscale x 16 x half> undef, <vscale x 16 x half*> undef, i32 2, <vscale x 16 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8f16.nxv8p0f16(<vscale x 8 x half> undef, <vscale x 8 x half*> undef, i32 2, <vscale x 8 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4f16.nxv4p0f16(<vscale x 4 x half> undef, <vscale x 4 x half*> undef, i32 2, <vscale x 4 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2f16.nxv2p0f16(<vscale x 2 x half> undef, <vscale x 2 x half*> undef, i32 2, <vscale x 2 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1f16.nxv1p0f16(<vscale x 1 x half> undef, <vscale x 1 x half*> undef, i32 2, <vscale x 1 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i64.nxv8p0i64(<vscale x 8 x i64> undef, <vscale x 8 x i64*> undef, i32 8, <vscale x 8 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i64.nxv4p0i64(<vscale x 4 x i64> undef, <vscale x 4 x i64*> undef, i32 8, <vscale x 4 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2i64.nxv2p0i64(<vscale x 2 x i64> undef, <vscale x 2 x i64*> undef, i32 8, <vscale x 2 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0i64(<vscale x 1 x i64> undef, <vscale x 1 x i64*> undef, i32 8, <vscale x 1 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv16i32.nxv16p0i32(<vscale x 16 x i32> undef, <vscale x 16 x i32*> undef, i32 4, <vscale x 16 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0i32(<vscale x 8 x i32> undef, <vscale x 8 x i32*> undef, i32 4, <vscale x 8 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> undef, <vscale x 4 x i32*> undef, i32 4, <vscale x 4 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2i32.nxv2p0i32(<vscale x 2 x i32> undef, <vscale x 2 x i32*> undef, i32 4, <vscale x 2 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1i32.nxv1p0i32(<vscale x 1 x i32> undef, <vscale x 1 x i32*> undef, i32 4, <vscale x 1 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv32i16.nxv32p0i16(<vscale x 32 x i16> undef, <vscale x 32 x i16*> undef, i32 2, <vscale x 32 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv16i16.nxv16p0i16(<vscale x 16 x i16> undef, <vscale x 16 x i16*> undef, i32 2, <vscale x 16 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i16.nxv8p0i16(<vscale x 8 x i16> undef, <vscale x 8 x i16*> undef, i32 2, <vscale x 8 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16> undef, <vscale x 4 x i16*> undef, i32 2, <vscale x 4 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2i16.nxv2p0i16(<vscale x 2 x i16> undef, <vscale x 2 x i16*> undef, i32 2, <vscale x 2 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1i16.nxv1p0i16(<vscale x 1 x i16> undef, <vscale x 1 x i16*> undef, i32 2, <vscale x 1 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.nxv64i8.nxv64p0i8(<vscale x 64 x i8> undef, <vscale x 64 x i8*> undef, i32 1, <vscale x 64 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv32i8.nxv32p0i8(<vscale x 32 x i8> undef, <vscale x 32 x i8*> undef, i32 1, <vscale x 32 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv16i8.nxv16p0i8(<vscale x 16 x i8> undef, <vscale x 16 x i8*> undef, i32 1, <vscale x 16 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i8.nxv8p0i8(<vscale x 8 x i8> undef, <vscale x 8 x i8*> undef, i32 1, <vscale x 8 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i8.nxv4p0i8(<vscale x 4 x i8> undef, <vscale x 4 x i8*> undef, i32 1, <vscale x 4 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2i8.nxv2p0i8(<vscale x 2 x i8> undef, <vscale x 2 x i8*> undef, i32 1, <vscale x 2 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1i8.nxv1p0i8(<vscale x 1 x i8> undef, <vscale x 1 x i8*> undef, i32 1, <vscale x 1 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8p0i8.nxv8p0p0i8(<vscale x 8 x i8*> undef, <vscale x 8 x i8**> undef, i32 8, <vscale x 8 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4p0i8.nxv4p0p0i8(<vscale x 4 x i8*> undef, <vscale x 4 x i8**> undef, i32 8, <vscale x 4 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2p0i8.nxv2p0p0i8(<vscale x 2 x i8*> undef, <vscale x 2 x i8**> undef, i32 8, <vscale x 2 x i1> undef)
+; MAX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1p0i8.nxv1p0p0i8(<vscale x 1 x i8*> undef, <vscale x 1 x i8**> undef, i32 8, <vscale x 1 x i1> undef)
  ; MAX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
  ;
  ; UNSUPPORTED-LABEL: 'masked_scatter_aligned'
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll

index 05c270c..39bde12 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
@@ -19,7 +19,10 @@ define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* noca
  ; RV32-NEXT:    [[A1:%.*]] = bitcast double* [[A:%.*]] to i8*
  ; RV32-NEXT:    [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8*
  ; RV32-NEXT:    [[B6:%.*]] = bitcast double* [[B:%.*]] to i8*
-; RV32-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; RV32-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT:    [[TMP1:%.*]] = call i64 @llvm.umax.i64(i64 12, i64 [[TMP0]])
+; RV32-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 625, [[TMP1]]
+; RV32-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
  ; RV32:       vector.memcheck:
  ; RV32-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[A]], i64 9985
  ; RV32-NEXT:    [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
@@ -36,42 +39,55 @@ define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* noca
  ; RV32-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
  ; RV32-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
  ; RV32:       vector.ph:
+; RV32-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT:    [[N_MOD_VF:%.*]] = urem i64 625, [[TMP2]]
+; RV32-NEXT:    [[N_VEC:%.*]] = sub i64 625, [[N_MOD_VF]]
+; RV32-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 16
+; RV32-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
+; RV32-NEXT:    [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
+; RV32-NEXT:    [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 16, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; RV32-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
+; RV32-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT:    [[TMP7:%.*]] = mul i64 16, [[TMP6]]
+; RV32-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i32 0
+; RV32-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
  ; RV32-NEXT:    br label [[VECTOR_BODY:%.*]]
  ; RV32:       vector.body:
  ; RV32-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; RV32-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 16, i64 32, i64 48>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; RV32-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <4 x i64> [[VEC_IND]]
-; RV32-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP0]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !alias.scope !0
-; RV32-NEXT:    [[TMP1:%.*]] = icmp slt <4 x i32> [[WIDE_MASKED_GATHER]], <i32 100, i32 100, i32 100, i32 100>
-; RV32-NEXT:    [[TMP2:%.*]] = shl nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
-; RV32-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[B]], <4 x i64> [[TMP2]]
-; RV32-NEXT:    [[WIDE_MASKED_GATHER12:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> [[TMP3]], i32 8, <4 x i1> [[TMP1]], <4 x double> undef), !alias.scope !3
-; RV32-NEXT:    [[TMP4:%.*]] = sitofp <4 x i32> [[WIDE_MASKED_GATHER]] to <4 x double>
-; RV32-NEXT:    [[TMP5:%.*]] = fadd <4 x double> [[WIDE_MASKED_GATHER12]], [[TMP4]]
-; RV32-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[A]], <4 x i64> [[VEC_IND]]
-; RV32-NEXT:    call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> [[TMP5]], <4 x double*> [[TMP6]], i32 8, <4 x i1> [[TMP1]]), !alias.scope !5, !noalias !7
-; RV32-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; RV32-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 64, i64 64, i64 64, i64 64>
-; RV32-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 624
-; RV32-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; RV32-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; RV32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <vscale x 1 x i64> [[VEC_IND]]
+; RV32-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 1 x i32> @llvm.masked.gather.nxv1i32.nxv1p0i32(<vscale x 1 x i32*> [[TMP8]], i32 4, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i32> undef), !alias.scope !0
+; RV32-NEXT:    [[TMP9:%.*]] = icmp slt <vscale x 1 x i32> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 1 x i32> insertelement (<vscale x 1 x i32> poison, i32 100, i32 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer)
+; RV32-NEXT:    [[TMP10:%.*]] = shl nuw nsw <vscale x 1 x i64> [[VEC_IND]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; RV32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, double* [[B]], <vscale x 1 x i64> [[TMP10]]
+; RV32-NEXT:    [[WIDE_MASKED_GATHER12:%.*]] = call <vscale x 1 x double> @llvm.masked.gather.nxv1f64.nxv1p0f64(<vscale x 1 x double*> [[TMP11]], i32 8, <vscale x 1 x i1> [[TMP9]], <vscale x 1 x double> undef), !alias.scope !3
+; RV32-NEXT:    [[TMP12:%.*]] = sitofp <vscale x 1 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 1 x double>
+; RV32-NEXT:    [[TMP13:%.*]] = fadd <vscale x 1 x double> [[WIDE_MASKED_GATHER12]], [[TMP12]]
+; RV32-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, double* [[A]], <vscale x 1 x i64> [[VEC_IND]]
+; RV32-NEXT:    call void @llvm.masked.scatter.nxv1f64.nxv1p0f64(<vscale x 1 x double> [[TMP13]], <vscale x 1 x double*> [[TMP14]], i32 8, <vscale x 1 x i1> [[TMP9]]), !alias.scope !5, !noalias !7
+; RV32-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
+; RV32-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; RV32-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV32-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
  ; RV32:       middle.block:
-; RV32-NEXT:    [[CMP_N:%.*]] = icmp eq i64 625, 624
+; RV32-NEXT:    [[CMP_N:%.*]] = icmp eq i64 625, [[N_VEC]]
  ; RV32-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
  ; RV32:       scalar.ph:
-; RV32-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; RV32-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
  ; RV32-NEXT:    br label [[FOR_BODY:%.*]]
  ; RV32:       for.body:
  ; RV32-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
  ; RV32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; RV32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; RV32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP8]], 100
+; RV32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; RV32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP17]], 100
  ; RV32-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
  ; RV32:       if.then:
-; RV32-NEXT:    [[TMP9:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
-; RV32-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP9]]
-; RV32-NEXT:    [[TMP10:%.*]] = load double, double* [[ARRAYIDX3]], align 8
-; RV32-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP8]] to double
-; RV32-NEXT:    [[ADD:%.*]] = fadd double [[TMP10]], [[CONV]]
+; RV32-NEXT:    [[TMP18:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; RV32-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP18]]
+; RV32-NEXT:    [[TMP19:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; RV32-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP17]] to double
+; RV32-NEXT:    [[ADD:%.*]] = fadd double [[TMP19]], [[CONV]]
  ; RV32-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
  ; RV32-NEXT:    store double [[ADD]], double* [[ARRAYIDX7]], align 8
  ; RV32-NEXT:    br label [[FOR_INC]]
@@ -87,7 +103,10 @@ define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* noca
  ; RV64-NEXT:    [[A1:%.*]] = bitcast double* [[A:%.*]] to i8*
  ; RV64-NEXT:    [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8*
  ; RV64-NEXT:    [[B6:%.*]] = bitcast double* [[B:%.*]] to i8*
-; RV64-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; RV64-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT:    [[TMP1:%.*]] = call i64 @llvm.umax.i64(i64 12, i64 [[TMP0]])
+; RV64-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 625, [[TMP1]]
+; RV64-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
  ; RV64:       vector.memcheck:
  ; RV64-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[A]], i64 9985
  ; RV64-NEXT:    [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
@@ -104,42 +123,55 @@ define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* noca
  ; RV64-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
  ; RV64-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
  ; RV64:       vector.ph:
+; RV64-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT:    [[N_MOD_VF:%.*]] = urem i64 625, [[TMP2]]
+; RV64-NEXT:    [[N_VEC:%.*]] = sub i64 625, [[N_MOD_VF]]
+; RV64-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 16
+; RV64-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
+; RV64-NEXT:    [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
+; RV64-NEXT:    [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 16, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; RV64-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
+; RV64-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT:    [[TMP7:%.*]] = mul i64 16, [[TMP6]]
+; RV64-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i32 0
+; RV64-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
  ; RV64-NEXT:    br label [[VECTOR_BODY:%.*]]
  ; RV64:       vector.body:
  ; RV64-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; RV64-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 16, i64 32, i64 48>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; RV64-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <4 x i64> [[VEC_IND]]
-; RV64-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP0]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !alias.scope !0
-; RV64-NEXT:    [[TMP1:%.*]] = icmp slt <4 x i32> [[WIDE_MASKED_GATHER]], <i32 100, i32 100, i32 100, i32 100>
-; RV64-NEXT:    [[TMP2:%.*]] = shl nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
-; RV64-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[B]], <4 x i64> [[TMP2]]
-; RV64-NEXT:    [[WIDE_MASKED_GATHER12:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> [[TMP3]], i32 8, <4 x i1> [[TMP1]], <4 x double> undef), !alias.scope !3
-; RV64-NEXT:    [[TMP4:%.*]] = sitofp <4 x i32> [[WIDE_MASKED_GATHER]] to <4 x double>
-; RV64-NEXT:    [[TMP5:%.*]] = fadd <4 x double> [[WIDE_MASKED_GATHER12]], [[TMP4]]
-; RV64-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[A]], <4 x i64> [[VEC_IND]]
-; RV64-NEXT:    call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> [[TMP5]], <4 x double*> [[TMP6]], i32 8, <4 x i1> [[TMP1]]), !alias.scope !5, !noalias !7
-; RV64-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; RV64-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 64, i64 64, i64 64, i64 64>
-; RV64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 624
-; RV64-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; RV64-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; RV64-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <vscale x 1 x i64> [[VEC_IND]]
+; RV64-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 1 x i32> @llvm.masked.gather.nxv1i32.nxv1p0i32(<vscale x 1 x i32*> [[TMP8]], i32 4, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i32> undef), !alias.scope !0
+; RV64-NEXT:    [[TMP9:%.*]] = icmp slt <vscale x 1 x i32> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 1 x i32> insertelement (<vscale x 1 x i32> poison, i32 100, i32 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer)
+; RV64-NEXT:    [[TMP10:%.*]] = shl nuw nsw <vscale x 1 x i64> [[VEC_IND]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; RV64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, double* [[B]], <vscale x 1 x i64> [[TMP10]]
+; RV64-NEXT:    [[WIDE_MASKED_GATHER12:%.*]] = call <vscale x 1 x double> @llvm.masked.gather.nxv1f64.nxv1p0f64(<vscale x 1 x double*> [[TMP11]], i32 8, <vscale x 1 x i1> [[TMP9]], <vscale x 1 x double> undef), !alias.scope !3
+; RV64-NEXT:    [[TMP12:%.*]] = sitofp <vscale x 1 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 1 x double>
+; RV64-NEXT:    [[TMP13:%.*]] = fadd <vscale x 1 x double> [[WIDE_MASKED_GATHER12]], [[TMP12]]
+; RV64-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, double* [[A]], <vscale x 1 x i64> [[VEC_IND]]
+; RV64-NEXT:    call void @llvm.masked.scatter.nxv1f64.nxv1p0f64(<vscale x 1 x double> [[TMP13]], <vscale x 1 x double*> [[TMP14]], i32 8, <vscale x 1 x i1> [[TMP9]]), !alias.scope !5, !noalias !7
+; RV64-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
+; RV64-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; RV64-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
  ; RV64:       middle.block:
-; RV64-NEXT:    [[CMP_N:%.*]] = icmp eq i64 625, 624
+; RV64-NEXT:    [[CMP_N:%.*]] = icmp eq i64 625, [[N_VEC]]
  ; RV64-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
  ; RV64:       scalar.ph:
-; RV64-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; RV64-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
  ; RV64-NEXT:    br label [[FOR_BODY:%.*]]
  ; RV64:       for.body:
  ; RV64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
  ; RV64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; RV64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; RV64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP8]], 100
+; RV64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; RV64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP17]], 100
  ; RV64-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
  ; RV64:       if.then:
-; RV64-NEXT:    [[TMP9:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
-; RV64-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP9]]
-; RV64-NEXT:    [[TMP10:%.*]] = load double, double* [[ARRAYIDX3]], align 8
-; RV64-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP8]] to double
-; RV64-NEXT:    [[ADD:%.*]] = fadd double [[TMP10]], [[CONV]]
+; RV64-NEXT:    [[TMP18:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; RV64-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP18]]
+; RV64-NEXT:    [[TMP19:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; RV64-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP17]] to double
+; RV64-NEXT:    [[ADD:%.*]] = fadd double [[TMP19]], [[CONV]]
  ; RV64-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
  ; RV64-NEXT:    store double [[ADD]], double* [[ARRAYIDX7]], align 8
  ; RV64-NEXT:    br label [[FOR_INC]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll

index c8502d6..b0c1f52 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll
@@ -320,50 +320,75 @@ for.end:
  define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %v, i64 %n) {
  ; VLENUNK-LABEL: @indexed_store(
  ; VLENUNK-NEXT:  entry:
+; VLENUNK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; VLENUNK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]]
+; VLENUNK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; VLENUNK:       vector.ph:
+; VLENUNK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; VLENUNK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
+; VLENUNK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; VLENUNK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[V:%.*]], i32 0
+; VLENUNK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VLENUNK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; VLENUNK:       vector.body:
+; VLENUNK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; VLENUNK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; VLENUNK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP2]]
+; VLENUNK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
+; VLENUNK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i64>, ptr [[TMP4]], align 8
+; VLENUNK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 1 x i64> [[WIDE_LOAD]]
+; VLENUNK-NEXT:    call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> [[BROADCAST_SPLAT]], <vscale x 1 x ptr> [[TMP5]], i32 8, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer))
+; VLENUNK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; VLENUNK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; VLENUNK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VLENUNK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VLENUNK:       middle.block:
+; VLENUNK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; VLENUNK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; VLENUNK:       scalar.ph:
+; VLENUNK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
  ; VLENUNK-NEXT:    br label [[FOR_BODY:%.*]]
  ; VLENUNK:       for.body:
-; VLENUNK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; VLENUNK-NEXT:    [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[IV]]
+; VLENUNK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; VLENUNK-NEXT:    [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
  ; VLENUNK-NEXT:    [[AIDX:%.*]] = load i64, ptr [[BADDR]], align 8
-; VLENUNK-NEXT:    [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[AIDX]]
-; VLENUNK-NEXT:    store i64 [[V:%.*]], ptr [[AADDR]], align 8
+; VLENUNK-NEXT:    [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[AIDX]]
+; VLENUNK-NEXT:    store i64 [[V]], ptr [[AADDR]], align 8
  ; VLENUNK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
  ; VLENUNK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; VLENUNK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; VLENUNK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
  ; VLENUNK:       for.end:
  ; VLENUNK-NEXT:    ret void
  ;
  ; VLEN128-LABEL: @indexed_store(
  ; VLEN128-NEXT:  entry:
-; VLEN128-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; VLEN128-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; VLEN128-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]]
+; VLEN128-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
  ; VLEN128:       vector.ph:
-; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i32 0
-; VLEN128-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
-; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i32 0
-; VLEN128-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer
+; VLEN128-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; VLEN128-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
+; VLEN128-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[V:%.*]], i32 0
+; VLEN128-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
  ; VLEN128-NEXT:    br label [[VECTOR_BODY:%.*]]
  ; VLEN128:       vector.body:
  ; VLEN128-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VLEN128-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; VLEN128-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; VLEN128-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP0]]
-; VLEN128-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VLEN128-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; VLEN128-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
-; VLEN128-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2
-; VLEN128-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8
-; VLEN128-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <2 x i64> [[WIDE_LOAD]]
-; VLEN128-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], <2 x i64> [[WIDE_LOAD1]]
-; VLEN128-NEXT:    call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[BROADCAST_SPLAT]], <2 x ptr> [[TMP6]], i32 8, <2 x i1> <i1 true, i1 true>)
-; VLEN128-NEXT:    call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[BROADCAST_SPLAT3]], <2 x ptr> [[TMP7]], i32 8, <2 x i1> <i1 true, i1 true>)
-; VLEN128-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; VLEN128-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; VLEN128-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VLEN128-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; VLEN128-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP2]]
+; VLEN128-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
+; VLEN128-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i64>, ptr [[TMP4]], align 8
+; VLEN128-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 1 x i64> [[WIDE_LOAD]]
+; VLEN128-NEXT:    call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> [[BROADCAST_SPLAT]], <vscale x 1 x ptr> [[TMP5]], i32 8, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer))
+; VLEN128-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; VLEN128-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; VLEN128-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VLEN128-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
  ; VLEN128:       middle.block:
-; VLEN128-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
+; VLEN128-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
  ; VLEN128-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
  ; VLEN128:       scalar.ph:
-; VLEN128-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; VLEN128-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
  ; VLEN128-NEXT:    br label [[FOR_BODY:%.*]]
  ; VLEN128:       for.body:
  ; VLEN128-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -397,56 +422,82 @@ for.end:
  define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %v, i64 %n) {
  ; VLENUNK-LABEL: @indexed_load(
  ; VLENUNK-NEXT:  entry:
+; VLENUNK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; VLENUNK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]]
+; VLENUNK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; VLENUNK:       vector.ph:
+; VLENUNK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; VLENUNK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
+; VLENUNK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; VLENUNK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; VLENUNK:       vector.body:
+; VLENUNK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; VLENUNK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 1 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; VLENUNK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; VLENUNK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP2]]
+; VLENUNK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
+; VLENUNK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i64>, ptr [[TMP4]], align 8
+; VLENUNK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 1 x i64> [[WIDE_LOAD]]
+; VLENUNK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> [[TMP5]], i32 8, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i64> undef)
+; VLENUNK-NEXT:    [[TMP6]] = add <vscale x 1 x i64> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
+; VLENUNK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; VLENUNK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
+; VLENUNK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VLENUNK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; VLENUNK:       middle.block:
+; VLENUNK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64> [[TMP6]])
+; VLENUNK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; VLENUNK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; VLENUNK:       scalar.ph:
+; VLENUNK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; VLENUNK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
  ; VLENUNK-NEXT:    br label [[FOR_BODY:%.*]]
  ; VLENUNK:       for.body:
-; VLENUNK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; VLENUNK-NEXT:    [[SUM:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[SUM_NEXT:%.*]], [[FOR_BODY]] ]
-; VLENUNK-NEXT:    [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[IV]]
+; VLENUNK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; VLENUNK-NEXT:    [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUM_NEXT:%.*]], [[FOR_BODY]] ]
+; VLENUNK-NEXT:    [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
  ; VLENUNK-NEXT:    [[AIDX:%.*]] = load i64, ptr [[BADDR]], align 8
-; VLENUNK-NEXT:    [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[AIDX]]
+; VLENUNK-NEXT:    [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[AIDX]]
  ; VLENUNK-NEXT:    [[ELEM:%.*]] = load i64, ptr [[AADDR]], align 8
  ; VLENUNK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
  ; VLENUNK-NEXT:    [[SUM_NEXT]] = add i64 [[SUM]], [[ELEM]]
  ; VLENUNK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; VLENUNK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; VLENUNK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
  ; VLENUNK:       for.end:
-; VLENUNK-NEXT:    [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ]
+; VLENUNK-NEXT:    [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
  ; VLENUNK-NEXT:    ret i64 [[SUM_NEXT_LCSSA]]
  ;
  ; VLEN128-LABEL: @indexed_load(
  ; VLEN128-NEXT:  entry:
-; VLEN128-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; VLEN128-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; VLEN128-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]]
+; VLEN128-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
  ; VLEN128:       vector.ph:
+; VLEN128-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; VLEN128-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
+; VLEN128-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
  ; VLEN128-NEXT:    br label [[VECTOR_BODY:%.*]]
  ; VLEN128:       vector.body:
  ; VLEN128-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VLEN128-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; VLEN128-NEXT:    [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; VLEN128-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; VLEN128-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; VLEN128-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP0]]
-; VLEN128-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VLEN128-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; VLEN128-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
-; VLEN128-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2
-; VLEN128-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8
-; VLEN128-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <2 x i64> [[WIDE_LOAD]]
-; VLEN128-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], <2 x i64> [[WIDE_LOAD2]]
-; VLEN128-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> [[TMP6]], i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> undef)
-; VLEN128-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> [[TMP7]], i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> undef)
-; VLEN128-NEXT:    [[TMP8]] = add <2 x i64> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
-; VLEN128-NEXT:    [[TMP9]] = add <2 x i64> [[VEC_PHI1]], [[WIDE_MASKED_GATHER3]]
-; VLEN128-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; VLEN128-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; VLEN128-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; VLEN128-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 1 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; VLEN128-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; VLEN128-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP2]]
+; VLEN128-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
+; VLEN128-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i64>, ptr [[TMP4]], align 8
+; VLEN128-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 1 x i64> [[WIDE_LOAD]]
+; VLEN128-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> [[TMP5]], i32 8, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i64> undef)
+; VLEN128-NEXT:    [[TMP6]] = add <vscale x 1 x i64> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
+; VLEN128-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; VLEN128-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
+; VLEN128-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VLEN128-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
  ; VLEN128:       middle.block:
-; VLEN128-NEXT:    [[BIN_RDX:%.*]] = add <2 x i64> [[TMP9]], [[TMP8]]
-; VLEN128-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
-; VLEN128-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
+; VLEN128-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64> [[TMP6]])
+; VLEN128-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
  ; VLEN128-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
  ; VLEN128:       scalar.ph:
-; VLEN128-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VLEN128-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; VLEN128-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; VLEN128-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
  ; VLEN128-NEXT:    br label [[FOR_BODY:%.*]]
  ; VLEN128:       for.body:
  ; VLEN128-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -460,7 +511,7 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64
  ; VLEN128-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
  ; VLEN128-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
  ; VLEN128:       for.end:
-; VLEN128-NEXT:    [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; VLEN128-NEXT:    [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
  ; VLEN128-NEXT:    ret i64 [[SUM_NEXT_LCSSA]]
  ;
  entry:
@@ -504,7 +555,7 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) {
  ; VLENUNK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
  ; VLENUNK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
  ; VLENUNK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VLENUNK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VLENUNK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
  ; VLENUNK:       middle.block:
  ; VLENUNK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
  ; VLENUNK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -517,7 +568,7 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) {
  ; VLENUNK-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
  ; VLENUNK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
  ; VLENUNK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; VLENUNK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; VLENUNK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
  ; VLENUNK:       for.end:
  ; VLENUNK-NEXT:    ret void
  ;
@@ -596,7 +647,7 @@ define void @splat_ptr(ptr noalias nocapture %a, ptr %v, i64 %n) {
  ; VLENUNK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
  ; VLENUNK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
  ; VLENUNK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VLENUNK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; VLENUNK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
  ; VLENUNK:       middle.block:
  ; VLENUNK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
  ; VLENUNK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -609,7 +660,7 @@ define void @splat_ptr(ptr noalias nocapture %a, ptr %v, i64 %n) {
  ; VLENUNK-NEXT:    store ptr [[V]], ptr [[ARRAYIDX]], align 8
  ; VLENUNK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
  ; VLENUNK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; VLENUNK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; VLENUNK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
  ; VLENUNK:       for.end:
  ; VLENUNK-NEXT:    ret void
  ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll

index 41976ac..2abd86f 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
@@ -74,16 +74,46 @@ for.end:
  define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %v, i64 %n) {
  ; CHECK-LABEL: @indexed_store(
  ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 -1025, [[TMP0]]
+; CHECK-NEXT:    br i1 [[TMP1]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[V:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP5]], i64 1024)
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr [[TMP7]], i32 8, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]], <vscale x 1 x i64> poison)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 1 x i64> [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> [[BROADCAST_SPLAT]], <vscale x 1 x ptr> [[TMP8]], i32 8, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
  ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
  ; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[IV]]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
  ; CHECK-NEXT:    [[AIDX:%.*]] = load i64, ptr [[BADDR]], align 8
-; CHECK-NEXT:    [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[AIDX]]
-; CHECK-NEXT:    store i64 [[V:%.*]], ptr [[AADDR]], align 8
+; CHECK-NEXT:    [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[AIDX]]
+; CHECK-NEXT:    store i64 [[V]], ptr [[AADDR]], align 8
  ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
  ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
  ; CHECK:       for.end:
  ; CHECK-NEXT:    ret void
  ;
@@ -107,20 +137,53 @@ for.end:
  define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %v, i64 %n) {
  ; CHECK-LABEL: @indexed_load(
  ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 -1025, [[TMP0]]
+; CHECK-NEXT:    br i1 [[TMP1]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 1 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP5]], i64 1024)
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr [[TMP7]], i32 8, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]], <vscale x 1 x i64> poison)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 1 x i64> [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> [[TMP8]], i32 8, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]], <vscale x 1 x i64> undef)
+; CHECK-NEXT:    [[TMP9]] = add <vscale x 1 x i64> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <vscale x 1 x i1> [[ACTIVE_LANE_MASK]], <vscale x 1 x i64> [[TMP9]], <vscale x 1 x i64> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64> [[TMP10]])
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
  ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
  ; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[SUM_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[IV]]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUM_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
  ; CHECK-NEXT:    [[AIDX:%.*]] = load i64, ptr [[BADDR]], align 8
-; CHECK-NEXT:    [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[AIDX]]
+; CHECK-NEXT:    [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[AIDX]]
  ; CHECK-NEXT:    [[ELEM:%.*]] = load i64, ptr [[AADDR]], align 8
  ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
  ; CHECK-NEXT:    [[SUM_NEXT]] = add i64 [[SUM]], [[ELEM]]
  ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
  ; CHECK:       for.end:
-; CHECK-NEXT:    [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
  ; CHECK-NEXT:    ret i64 [[SUM_NEXT_LCSSA]]
  ;
  entry:
@@ -168,7 +231,7 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) {
  ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
  ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
  ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
  ; CHECK:       middle.block:
  ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
  ; CHECK:       scalar.ph:
@@ -180,7 +243,7 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) {
  ; CHECK-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
  ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
  ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
  ; CHECK:       for.end:
  ; CHECK-NEXT:    ret void
  ;
@@ -226,7 +289,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
  ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
  ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
  ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
  ; CHECK:       middle.block:
  ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
  ; CHECK:       scalar.ph:
@@ -239,7 +302,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
  ; CHECK-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
  ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
  ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
  ; CHECK:       for.end:
  ; CHECK-NEXT:    ret void
  ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll

index 2b036a5..86d7177 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -338,21 +338,60 @@ for.end:
  define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %n) {
  ; SCALABLE-LABEL: @conditional_uniform_load(
  ; SCALABLE-NEXT:  entry:
+; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE:       vector.ph:
+; SCALABLE-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
+; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; SCALABLE-NEXT:    [[TMP2:%.*]] = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
+; SCALABLE-NEXT:    [[TMP3:%.*]] = add <vscale x 1 x i64> [[TMP2]], zeroinitializer
+; SCALABLE-NEXT:    [[TMP4:%.*]] = mul <vscale x 1 x i64> [[TMP3]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP4]]
+; SCALABLE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP6:%.*]] = mul i64 1, [[TMP5]]
+; SCALABLE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP6]], i32 0
+; SCALABLE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x ptr> poison, ptr [[B:%.*]], i32 0
+; SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 1 x ptr> poison, <vscale x 1 x i32> zeroinitializer
+; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
+; SCALABLE:       vector.body:
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT:    [[TMP8:%.*]] = icmp ugt <vscale x 1 x i64> [[VEC_IND]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 10, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 1 x i1> [[TMP8]], <vscale x 1 x i64> undef)
+; SCALABLE-NEXT:    [[TMP9:%.*]] = xor <vscale x 1 x i1> [[TMP8]], shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[PREDPHI:%.*]] = select <vscale x 1 x i1> [[TMP8]], <vscale x 1 x i64> [[WIDE_MASKED_GATHER]], <vscale x 1 x i64> zeroinitializer
+; SCALABLE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP7]]
+; SCALABLE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
+; SCALABLE-NEXT:    store <vscale x 1 x i64> [[PREDPHI]], ptr [[TMP11]], align 8
+; SCALABLE-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
+; SCALABLE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; SCALABLE-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; SCALABLE:       middle.block:
+; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; SCALABLE:       scalar.ph:
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
  ; SCALABLE-NEXT:    br label [[FOR_BODY:%.*]]
  ; SCALABLE:       for.body:
-; SCALABLE-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
+; SCALABLE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
  ; SCALABLE-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[IV]], 10
  ; SCALABLE-NEXT:    br i1 [[CMP]], label [[DO_LOAD:%.*]], label [[LATCH]]
  ; SCALABLE:       do_load:
-; SCALABLE-NEXT:    [[V:%.*]] = load i64, ptr [[B:%.*]], align 8
+; SCALABLE-NEXT:    [[V:%.*]] = load i64, ptr [[B]], align 8
  ; SCALABLE-NEXT:    br label [[LATCH]]
  ; SCALABLE:       latch:
  ; SCALABLE-NEXT:    [[PHI:%.*]] = phi i64 [ 0, [[FOR_BODY]] ], [ [[V]], [[DO_LOAD]] ]
-; SCALABLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]]
+; SCALABLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
  ; SCALABLE-NEXT:    store i64 [[PHI]], ptr [[ARRAYIDX]], align 8
  ; SCALABLE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
  ; SCALABLE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
  ; SCALABLE:       for.end:
  ; SCALABLE-NEXT:    ret void
  ;
@@ -414,21 +453,66 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
  ;
  ; TF-SCALABLE-LABEL: @conditional_uniform_load(
  ; TF-SCALABLE-NEXT:  entry:
+; TF-SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT:    [[TMP1:%.*]] = icmp ult i64 -1025, [[TMP0]]
+; TF-SCALABLE-NEXT:    br i1 [[TMP1]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; TF-SCALABLE:       vector.ph:
+; TF-SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; TF-SCALABLE-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]]
+; TF-SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]]
+; TF-SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; TF-SCALABLE-NEXT:    [[TMP5:%.*]] = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
+; TF-SCALABLE-NEXT:    [[TMP6:%.*]] = add <vscale x 1 x i64> [[TMP5]], zeroinitializer
+; TF-SCALABLE-NEXT:    [[TMP7:%.*]] = mul <vscale x 1 x i64> [[TMP6]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; TF-SCALABLE-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP7]]
+; TF-SCALABLE-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP8]]
+; TF-SCALABLE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP9]], i32 0
+; TF-SCALABLE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; TF-SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x ptr> poison, ptr [[B:%.*]], i32 0
+; TF-SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 1 x ptr> poison, <vscale x 1 x i32> zeroinitializer
+; TF-SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
+; TF-SCALABLE:       vector.body:
+; TF-SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; TF-SCALABLE-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP10]], i64 1024)
+; TF-SCALABLE-NEXT:    [[TMP11:%.*]] = icmp ugt <vscale x 1 x i64> [[VEC_IND]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 10, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; TF-SCALABLE-NEXT:    [[TMP12:%.*]] = select <vscale x 1 x i1> [[ACTIVE_LANE_MASK]], <vscale x 1 x i1> [[TMP11]], <vscale x 1 x i1> zeroinitializer
+; TF-SCALABLE-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i64> undef)
+; TF-SCALABLE-NEXT:    [[TMP13:%.*]] = xor <vscale x 1 x i1> [[TMP11]], shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer)
+; TF-SCALABLE-NEXT:    [[TMP14:%.*]] = select <vscale x 1 x i1> [[ACTIVE_LANE_MASK]], <vscale x 1 x i1> [[TMP13]], <vscale x 1 x i1> zeroinitializer
+; TF-SCALABLE-NEXT:    [[PREDPHI:%.*]] = select <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i64> [[WIDE_MASKED_GATHER]], <vscale x 1 x i64> zeroinitializer
+; TF-SCALABLE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP10]]
+; TF-SCALABLE-NEXT:    [[TMP16:%.*]] = or <vscale x 1 x i1> [[TMP12]], [[TMP14]]
+; TF-SCALABLE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i32 0
+; TF-SCALABLE-NEXT:    call void @llvm.masked.store.nxv1i64.p0(<vscale x 1 x i64> [[PREDPHI]], ptr [[TMP17]], i32 8, <vscale x 1 x i1> [[TMP16]])
+; TF-SCALABLE-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP18]]
+; TF-SCALABLE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; TF-SCALABLE-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; TF-SCALABLE-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; TF-SCALABLE:       middle.block:
+; TF-SCALABLE-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; TF-SCALABLE:       scalar.ph:
+; TF-SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
  ; TF-SCALABLE-NEXT:    br label [[FOR_BODY:%.*]]
  ; TF-SCALABLE:       for.body:
-; TF-SCALABLE-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
+; TF-SCALABLE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
  ; TF-SCALABLE-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[IV]], 10
  ; TF-SCALABLE-NEXT:    br i1 [[CMP]], label [[DO_LOAD:%.*]], label [[LATCH]]
  ; TF-SCALABLE:       do_load:
-; TF-SCALABLE-NEXT:    [[V:%.*]] = load i64, ptr [[B:%.*]], align 8
+; TF-SCALABLE-NEXT:    [[V:%.*]] = load i64, ptr [[B]], align 8
  ; TF-SCALABLE-NEXT:    br label [[LATCH]]
  ; TF-SCALABLE:       latch:
  ; TF-SCALABLE-NEXT:    [[PHI:%.*]] = phi i64 [ 0, [[FOR_BODY]] ], [ [[V]], [[DO_LOAD]] ]
-; TF-SCALABLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]]
+; TF-SCALABLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
  ; TF-SCALABLE-NEXT:    store i64 [[PHI]], ptr [[ARRAYIDX]], align 8
  ; TF-SCALABLE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
  ; TF-SCALABLE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; TF-SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; TF-SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
  ; TF-SCALABLE:       for.end:
  ; TF-SCALABLE-NEXT:    ret void
  ;
@@ -523,7 +607,7 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
  ; SCALABLE-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
  ; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
  ; SCALABLE-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; SCALABLE-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
  ; SCALABLE:       middle.block:
  ; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
  ; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -537,7 +621,7 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
  ; SCALABLE-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
  ; SCALABLE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
  ; SCALABLE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
  ; SCALABLE:       for.end:
  ; SCALABLE-NEXT:    ret void
  ;
@@ -608,7 +692,7 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
  ; TF-SCALABLE-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
  ; TF-SCALABLE-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
  ; TF-SCALABLE-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; TF-SCALABLE-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; TF-SCALABLE-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
  ; TF-SCALABLE:       middle.block:
  ; TF-SCALABLE-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
  ; TF-SCALABLE:       scalar.ph:
@@ -621,7 +705,7 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
  ; TF-SCALABLE-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
  ; TF-SCALABLE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
  ; TF-SCALABLE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; TF-SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; TF-SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
  ; TF-SCALABLE:       for.end:
  ; TF-SCALABLE-NEXT:    ret void
  ;
@@ -698,7 +782,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
  ; SCALABLE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
  ; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
  ; SCALABLE-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; SCALABLE-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
  ; SCALABLE:       middle.block:
  ; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
  ; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -712,7 +796,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
  ; SCALABLE-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
  ; SCALABLE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
  ; SCALABLE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
  ; SCALABLE:       for.end:
  ; SCALABLE-NEXT:    ret void
  ;
@@ -783,7 +867,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
  ; TF-SCALABLE-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
  ; TF-SCALABLE-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
  ; TF-SCALABLE-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; TF-SCALABLE-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; TF-SCALABLE-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
  ; TF-SCALABLE:       middle.block:
  ; TF-SCALABLE-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
  ; TF-SCALABLE:       scalar.ph:
@@ -796,7 +880,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
  ; TF-SCALABLE-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
  ; TF-SCALABLE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
  ; TF-SCALABLE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; TF-SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; TF-SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
  ; TF-SCALABLE:       for.end:
  ; TF-SCALABLE-NEXT:    ret void
  ;
@@ -853,15 +937,53 @@ for.end:
  define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %v, i64 %n) {
  ; SCALABLE-LABEL: @uniform_store_of_loop_varying(
  ; SCALABLE-NEXT:  entry:
+; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE:       vector.ph:
+; SCALABLE-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
+; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; SCALABLE-NEXT:    [[TMP2:%.*]] = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
+; SCALABLE-NEXT:    [[TMP3:%.*]] = add <vscale x 1 x i64> [[TMP2]], zeroinitializer
+; SCALABLE-NEXT:    [[TMP4:%.*]] = mul <vscale x 1 x i64> [[TMP3]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP4]]
+; SCALABLE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP6:%.*]] = mul i64 1, [[TMP5]]
+; SCALABLE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP6]], i32 0
+; SCALABLE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x ptr> poison, ptr [[B:%.*]], i32 0
+; SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 1 x ptr> poison, <vscale x 1 x i32> zeroinitializer
+; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[V:%.*]], i32 0
+; SCALABLE-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
+; SCALABLE:       vector.body:
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT:    call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> [[VEC_IND]], <vscale x 1 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer))
+; SCALABLE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP7]]
+; SCALABLE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
+; SCALABLE-NEXT:    store <vscale x 1 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP9]], align 8
+; SCALABLE-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; SCALABLE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; SCALABLE-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; SCALABLE:       middle.block:
+; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; SCALABLE:       scalar.ph:
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
  ; SCALABLE-NEXT:    br label [[FOR_BODY:%.*]]
  ; SCALABLE:       for.body:
-; SCALABLE-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; SCALABLE-NEXT:    store i64 [[IV]], ptr [[B:%.*]], align 8
-; SCALABLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]]
-; SCALABLE-NEXT:    store i64 [[V:%.*]], ptr [[ARRAYIDX]], align 8
+; SCALABLE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; SCALABLE-NEXT:    store i64 [[IV]], ptr [[B]], align 8
+; SCALABLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; SCALABLE-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
  ; SCALABLE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
  ; SCALABLE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
  ; SCALABLE:       for.end:
  ; SCALABLE-NEXT:    ret void
  ;
@@ -912,15 +1034,56 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
  ;
  ; TF-SCALABLE-LABEL: @uniform_store_of_loop_varying(
  ; TF-SCALABLE-NEXT:  entry:
+; TF-SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT:    [[TMP1:%.*]] = icmp ult i64 -1025, [[TMP0]]
+; TF-SCALABLE-NEXT:    br i1 [[TMP1]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; TF-SCALABLE:       vector.ph:
+; TF-SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; TF-SCALABLE-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]]
+; TF-SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]]
+; TF-SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; TF-SCALABLE-NEXT:    [[TMP5:%.*]] = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
+; TF-SCALABLE-NEXT:    [[TMP6:%.*]] = add <vscale x 1 x i64> [[TMP5]], zeroinitializer
+; TF-SCALABLE-NEXT:    [[TMP7:%.*]] = mul <vscale x 1 x i64> [[TMP6]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; TF-SCALABLE-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP7]]
+; TF-SCALABLE-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP8]]
+; TF-SCALABLE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP9]], i32 0
+; TF-SCALABLE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; TF-SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x ptr> poison, ptr [[B:%.*]], i32 0
+; TF-SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 1 x ptr> poison, <vscale x 1 x i32> zeroinitializer
+; TF-SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[V:%.*]], i32 0
+; TF-SCALABLE-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; TF-SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
+; TF-SCALABLE:       vector.body:
+; TF-SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; TF-SCALABLE-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP10]], i64 1024)
+; TF-SCALABLE-NEXT:    call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> [[VEC_IND]], <vscale x 1 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]])
+; TF-SCALABLE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP10]]
+; TF-SCALABLE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP11]], i32 0
+; TF-SCALABLE-NEXT:    call void @llvm.masked.store.nxv1i64.p0(<vscale x 1 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP12]], i32 8, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]])
+; TF-SCALABLE-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP13]]
+; TF-SCALABLE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; TF-SCALABLE-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; TF-SCALABLE-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; TF-SCALABLE:       middle.block:
+; TF-SCALABLE-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; TF-SCALABLE:       scalar.ph:
+; TF-SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
  ; TF-SCALABLE-NEXT:    br label [[FOR_BODY:%.*]]
  ; TF-SCALABLE:       for.body:
-; TF-SCALABLE-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; TF-SCALABLE-NEXT:    store i64 [[IV]], ptr [[B:%.*]], align 8
-; TF-SCALABLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]]
-; TF-SCALABLE-NEXT:    store i64 [[V:%.*]], ptr [[ARRAYIDX]], align 8
+; TF-SCALABLE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; TF-SCALABLE-NEXT:    store i64 [[IV]], ptr [[B]], align 8
+; TF-SCALABLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; TF-SCALABLE-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
  ; TF-SCALABLE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
  ; TF-SCALABLE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; TF-SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; TF-SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
  ; TF-SCALABLE:       for.end:
  ; TF-SCALABLE-NEXT:    ret void
  ;
@@ -979,20 +1142,59 @@ for.end:
  define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %v, i64 %n) {
  ; SCALABLE-LABEL: @conditional_uniform_store(
  ; SCALABLE-NEXT:  entry:
+; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE:       vector.ph:
+; SCALABLE-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
+; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; SCALABLE-NEXT:    [[TMP2:%.*]] = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
+; SCALABLE-NEXT:    [[TMP3:%.*]] = add <vscale x 1 x i64> [[TMP2]], zeroinitializer
+; SCALABLE-NEXT:    [[TMP4:%.*]] = mul <vscale x 1 x i64> [[TMP3]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP4]]
+; SCALABLE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP6:%.*]] = mul i64 1, [[TMP5]]
+; SCALABLE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP6]], i32 0
+; SCALABLE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[V:%.*]], i32 0
+; SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 1 x ptr> poison, ptr [[B:%.*]], i32 0
+; SCALABLE-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 1 x ptr> [[BROADCAST_SPLATINSERT1]], <vscale x 1 x ptr> poison, <vscale x 1 x i32> zeroinitializer
+; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
+; SCALABLE:       vector.body:
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT:    [[TMP8:%.*]] = icmp ugt <vscale x 1 x i64> [[VEC_IND]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 10, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; SCALABLE-NEXT:    call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> [[BROADCAST_SPLAT]], <vscale x 1 x ptr> [[BROADCAST_SPLAT2]], i32 8, <vscale x 1 x i1> [[TMP8]])
+; SCALABLE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP7]]
+; SCALABLE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i32 0
+; SCALABLE-NEXT:    store <vscale x 1 x i64> [[BROADCAST_SPLAT]], ptr [[TMP10]], align 8
+; SCALABLE-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
+; SCALABLE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; SCALABLE-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; SCALABLE:       middle.block:
+; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; SCALABLE:       scalar.ph:
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
  ; SCALABLE-NEXT:    br label [[FOR_BODY:%.*]]
  ; SCALABLE:       for.body:
-; SCALABLE-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
+; SCALABLE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
  ; SCALABLE-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[IV]], 10
  ; SCALABLE-NEXT:    br i1 [[CMP]], label [[DO_STORE:%.*]], label [[LATCH]]
  ; SCALABLE:       do_store:
-; SCALABLE-NEXT:    store i64 [[V:%.*]], ptr [[B:%.*]], align 8
+; SCALABLE-NEXT:    store i64 [[V]], ptr [[B]], align 8
  ; SCALABLE-NEXT:    br label [[LATCH]]
  ; SCALABLE:       latch:
-; SCALABLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]]
+; SCALABLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
  ; SCALABLE-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
  ; SCALABLE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
  ; SCALABLE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
  ; SCALABLE:       for.end:
  ; SCALABLE-NEXT:    ret void
  ;
@@ -1053,20 +1255,66 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
  ;
  ; TF-SCALABLE-LABEL: @conditional_uniform_store(
  ; TF-SCALABLE-NEXT:  entry:
+; TF-SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT:    [[TMP1:%.*]] = icmp ult i64 -1025, [[TMP0]]
+; TF-SCALABLE-NEXT:    br i1 [[TMP1]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; TF-SCALABLE:       vector.ph:
+; TF-SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; TF-SCALABLE-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]]
+; TF-SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]]
+; TF-SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; TF-SCALABLE-NEXT:    [[TMP5:%.*]] = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
+; TF-SCALABLE-NEXT:    [[TMP6:%.*]] = add <vscale x 1 x i64> [[TMP5]], zeroinitializer
+; TF-SCALABLE-NEXT:    [[TMP7:%.*]] = mul <vscale x 1 x i64> [[TMP6]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; TF-SCALABLE-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP7]]
+; TF-SCALABLE-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP8]]
+; TF-SCALABLE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP9]], i32 0
+; TF-SCALABLE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; TF-SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[V:%.*]], i32 0
+; TF-SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; TF-SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 1 x ptr> poison, ptr [[B:%.*]], i32 0
+; TF-SCALABLE-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 1 x ptr> [[BROADCAST_SPLATINSERT1]], <vscale x 1 x ptr> poison, <vscale x 1 x i32> zeroinitializer
+; TF-SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
+; TF-SCALABLE:       vector.body:
+; TF-SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; TF-SCALABLE-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP10]], i64 1024)
+; TF-SCALABLE-NEXT:    [[TMP11:%.*]] = icmp ugt <vscale x 1 x i64> [[VEC_IND]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 10, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; TF-SCALABLE-NEXT:    [[TMP12:%.*]] = select <vscale x 1 x i1> [[ACTIVE_LANE_MASK]], <vscale x 1 x i1> [[TMP11]], <vscale x 1 x i1> zeroinitializer
+; TF-SCALABLE-NEXT:    call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> [[BROADCAST_SPLAT]], <vscale x 1 x ptr> [[BROADCAST_SPLAT2]], i32 8, <vscale x 1 x i1> [[TMP12]])
+; TF-SCALABLE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP10]]
+; TF-SCALABLE-NEXT:    [[TMP14:%.*]] = xor <vscale x 1 x i1> [[TMP11]], shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer)
+; TF-SCALABLE-NEXT:    [[TMP15:%.*]] = select <vscale x 1 x i1> [[ACTIVE_LANE_MASK]], <vscale x 1 x i1> [[TMP14]], <vscale x 1 x i1> zeroinitializer
+; TF-SCALABLE-NEXT:    [[TMP16:%.*]] = or <vscale x 1 x i1> [[TMP12]], [[TMP15]]
+; TF-SCALABLE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
+; TF-SCALABLE-NEXT:    call void @llvm.masked.store.nxv1i64.p0(<vscale x 1 x i64> [[BROADCAST_SPLAT]], ptr [[TMP17]], i32 8, <vscale x 1 x i1> [[TMP16]])
+; TF-SCALABLE-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP18]]
+; TF-SCALABLE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; TF-SCALABLE-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; TF-SCALABLE-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; TF-SCALABLE:       middle.block:
+; TF-SCALABLE-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; TF-SCALABLE:       scalar.ph:
+; TF-SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
  ; TF-SCALABLE-NEXT:    br label [[FOR_BODY:%.*]]
  ; TF-SCALABLE:       for.body:
-; TF-SCALABLE-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
+; TF-SCALABLE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
  ; TF-SCALABLE-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[IV]], 10
  ; TF-SCALABLE-NEXT:    br i1 [[CMP]], label [[DO_STORE:%.*]], label [[LATCH]]
  ; TF-SCALABLE:       do_store:
-; TF-SCALABLE-NEXT:    store i64 [[V:%.*]], ptr [[B:%.*]], align 8
+; TF-SCALABLE-NEXT:    store i64 [[V]], ptr [[B]], align 8
  ; TF-SCALABLE-NEXT:    br label [[LATCH]]
  ; TF-SCALABLE:       latch:
-; TF-SCALABLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]]
+; TF-SCALABLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
  ; TF-SCALABLE-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
  ; TF-SCALABLE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
  ; TF-SCALABLE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; TF-SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; TF-SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
  ; TF-SCALABLE:       for.end:
  ; TF-SCALABLE-NEXT:    ret void
  ;
@@ -1159,7 +1407,7 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
  ; SCALABLE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
  ; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
  ; SCALABLE-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; SCALABLE-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
  ; SCALABLE:       middle.block:
  ; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
  ; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1173,7 +1421,7 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
  ; SCALABLE-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
  ; SCALABLE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
  ; SCALABLE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
  ; SCALABLE:       for.end:
  ; SCALABLE-NEXT:    ret void
  ;
@@ -1244,7 +1492,7 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
  ; TF-SCALABLE-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
  ; TF-SCALABLE-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
  ; TF-SCALABLE-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; TF-SCALABLE-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; TF-SCALABLE-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
  ; TF-SCALABLE:       middle.block:
  ; TF-SCALABLE-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
  ; TF-SCALABLE:       scalar.ph:
@@ -1257,7 +1505,7 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
  ; TF-SCALABLE-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
  ; TF-SCALABLE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
  ; TF-SCALABLE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; TF-SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; TF-SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
  ; TF-SCALABLE:       for.end:
  ; TF-SCALABLE-NEXT:    ret void
  ;
author	Philip Reames <preames@rivosinc.com>
	Thu, 18 Aug 2022 19:56:54 +0000 (12:56 -0700)
committer	Philip Reames <listmail@philipreames.com>
	Thu, 18 Aug 2022 20:10:03 +0000 (13:10 -0700)
llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp		patch \| blob \| history
llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h		patch \| blob \| history
llvm/test/Analysis/CostModel/RISCV/reduce-scalable-fp.ll		patch \| blob \| history
llvm/test/Analysis/CostModel/RISCV/reduce-scalable-int.ll		patch \| blob \| history
llvm/test/Analysis/CostModel/RISCV/scalable-gather.ll		patch \| blob \| history
llvm/test/Analysis/CostModel/RISCV/scalable-scatter.ll		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll		patch \| blob \| history