From: Pierre-vh Date: Tue, 5 May 2020 13:25:23 +0000 (+0100) Subject: [LSR][ARM] Add new TTI hook to mark some LSR chains as profitable X-Git-Tag: llvmorg-12-init~6190 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=2668775f66656f719f7d8164066ec5ca64d707f2;p=platform%2Fupstream%2Fllvm.git [LSR][ARM] Add new TTI hook to mark some LSR chains as profitable This patch adds a new TTI hook to allow targets to tell LSR that a chain including some instruction is already profitable and should not be optimized. This patch also adds an implementation of this TTI hook for ARM so LSR doesn't optimize chains that include the VCTP intrinsic. Differential Revision: https://reviews.llvm.org/D79418 --- diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index f3e5756..e5f2c53c 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -519,6 +519,9 @@ public: bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, TargetTransformInfo::LSRCost &C2) const; + /// \returns true if LSR should not optimize a chain that includes \p I. + bool isProfitableLSRChainElement(Instruction *I) const; + /// Return true if the target can fuse a compare and branch. /// Loop-strength-reduction (LSR) uses that knowledge to adjust its cost /// calculation for the instructions in a loop. 
@@ -1233,6 +1236,7 @@ public: Instruction *I) = 0; virtual bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, TargetTransformInfo::LSRCost &C2) = 0; + virtual bool isProfitableLSRChainElement(Instruction *I) = 0; virtual bool canMacroFuseCmp() = 0; virtual bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, @@ -1542,6 +1546,9 @@ public: TargetTransformInfo::LSRCost &C2) override { return Impl.isLSRCostLess(C1, C2); } + bool isProfitableLSRChainElement(Instruction *I) override { + return Impl.isProfitableLSRChainElement(I); + } bool canMacroFuseCmp() override { return Impl.canMacroFuseCmp(); } bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 372a254..669e4b4 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -166,6 +166,8 @@ public: C2.ScaleCost, C2.ImmCost, C2.SetupCost); } + bool isProfitableLSRChainElement(Instruction *I) { return false; } + bool canMacroFuseCmp() { return false; } bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI, diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 5a06644..0d972d7 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -262,6 +262,10 @@ public: return TargetTransformInfoImplBase::isLSRCostLess(C1, C2); } + bool isProfitableLSRChainElement(Instruction *I) { + return TargetTransformInfoImplBase::isProfitableLSRChainElement(I); + } + int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) { TargetLoweringBase::AddrMode AM; diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp 
b/llvm/lib/Analysis/TargetTransformInfo.cpp index 95b17aa..cda3d16 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -261,6 +261,10 @@ bool TargetTransformInfo::isLSRCostLess(LSRCost &C1, LSRCost &C2) const { return TTIImpl->isLSRCostLess(C1, C2); } +bool TargetTransformInfo::isProfitableLSRChainElement(Instruction *I) const { + return TTIImpl->isProfitableLSRChainElement(I); +} + bool TargetTransformInfo::canMacroFuseCmp() const { return TTIImpl->canMacroFuseCmp(); } diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index c90429f..3864e28 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsARM.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/MC/SubtargetFeature.h" @@ -550,6 +551,23 @@ int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, return BaseT::getAddressComputationCost(Ty, SE, Ptr); } +bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) { + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + // If a VCTP is part of a chain, it's already profitable and shouldn't be + // optimized, else LSR may block tail-predication.
+ switch (II->getIntrinsicID()) { + case Intrinsic::arm_mve_vctp8: + case Intrinsic::arm_mve_vctp16: + case Intrinsic::arm_mve_vctp32: + case Intrinsic::arm_mve_vctp64: + return true; + default: + break; + } + } + return false; +} + bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) { if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps()) return false; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index 9cb4916..d6efc6e 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -151,6 +151,8 @@ public: return ST->getMaxInterleaveFactor(); } + bool isProfitableLSRChainElement(Instruction *I); + bool isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment); bool isLegalMaskedStore(Type *DataTy, MaybeAlign Alignment) { diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 652ff6b..d8a272a 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -2820,9 +2820,10 @@ bool IVChain::isProfitableIncrement(const SCEV *OperExpr, /// increments can be computed in fewer registers when chained. /// /// TODO: Consider IVInc free if it's already used in another chains. 
-static bool -isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users, - ScalarEvolution &SE) { +static bool isProfitableChain(IVChain &Chain, + SmallPtrSetImpl<Instruction *> &Users, + ScalarEvolution &SE, + const TargetTransformInfo &TTI) { if (StressIVChain) return true; @@ -2851,7 +2852,14 @@ isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users, unsigned NumConstIncrements = 0; unsigned NumVarIncrements = 0; unsigned NumReusedIncrements = 0; + + if (TTI.isProfitableLSRChainElement(Chain.Incs[0].UserInst)) + return true; + for (const IVInc &Inc : Chain) { + if (TTI.isProfitableLSRChainElement(Inc.UserInst)) + return true; + if (Inc.IncExpr->isZero()) continue; @@ -3082,7 +3090,7 @@ void LSRInstance::CollectChains() { for (unsigned UsersIdx = 0, NChains = IVChainVec.size(); UsersIdx < NChains; ++UsersIdx) { if (!isProfitableChain(IVChainVec[UsersIdx], - ChainUsersVec[UsersIdx].FarUsers, SE)) + ChainUsersVec[UsersIdx].FarUsers, SE, TTI)) continue; // Preserve the chain at UsesIdx. if (ChainIdx != UsersIdx) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll new file mode 100644 index 0000000..bc2c7e0 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll @@ -0,0 +1,69 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -O3 -disable-mve-tail-predication=false -mtriple=thumbv8.1m.main -mattr=+mve,+mve.fp %s -o - | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv8.1m-arm-none-eabi" + +; Tests that LSR will not interfere with the VCTP intrinsic, +; and that this loop will correctly become tail-predicated.
+ +define arm_aapcs_vfpcc float @vctpi32(float* %0, i32 %1) { +; CHECK-LABEL: vctpi32: +; CHECK: @ %bb.0: +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vmvn.i32 q1, #0x1f +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: vadd.i32 q1, q3, q1 +; CHECK-NEXT: subs r3, r1, #1 +; CHECK-NEXT: vidup.u32 q2, r2, #8 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vadd.i32 q1, q2, r0 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: dlstp.32 lr, r3 +; CHECK-NEXT: .LBB0_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q2, [q1, #32]! +; CHECK-NEXT: vadd.f32 q0, q0, q2 +; CHECK-NEXT: letp lr, .LBB0_1 +; CHECK-NEXT: @ %bb.2: +; CHECK-NEXT: bl vecAddAcrossF32Mve +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: vcvt.f32.s32 s0, s0 +; CHECK-NEXT: vabs.f32 s0, s0 +; CHECK-NEXT: pop {r7, pc} + %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8) + %4 = extractvalue { <4 x i32>, i32 } %3, 0 + %5 = add nsw i32 %1, -1 + %6 = ptrtoint float* %0 to i32 + %7 = insertelement <4 x i32> undef, i32 %6, i32 0 + %8 = add <4 x i32> %7, <i32 -32, i32 -32, i32 -32, i32 -32> + %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer + %10 = add <4 x i32> %4, %9 + br label %11 + +11: + %12 = phi i32 [ %5, %2 ], [ %20, %11 ] + %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ] + %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ] + %15 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %12) + %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %15) + %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1 + %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0 + %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %15, <4 x float> %13) + %20 = add nsw i32 %12, -4 + %21 = icmp sgt i32 %12, 4 + br i1 %21, label %11, label %22 + +22: + %23 = tail call arm_aapcs_vfpcc i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float>
%19) + %24 = sitofp i32 %23 to float + %25 = tail call float @llvm.fabs.f32(float %24) + ret float %25 +} + +declare { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32, i32) +declare <4 x i1> @llvm.arm.mve.vctp32(i32) +declare { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>) +declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) +declare arm_aapcs_vfpcc i32 @vecAddAcrossF32Mve(...) +declare float @llvm.fabs.f32(float) diff --git a/llvm/test/Transforms/LoopStrengthReduce/ARM/vctp-chains.ll b/llvm/test/Transforms/LoopStrengthReduce/ARM/vctp-chains.ll new file mode 100644 index 0000000..08b2438 --- /dev/null +++ b/llvm/test/Transforms/LoopStrengthReduce/ARM/vctp-chains.ll @@ -0,0 +1,257 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=thumbv8.1m.main -mattr=+mve %s -S -loop-reduce -o - | FileCheck %s +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv8.1m-arm-none-eabi" + +define float @vctp8(float* %0, i32 %1) { +; CHECK-LABEL: @vctp8( +; CHECK-NEXT: [[TMP3:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP1:%.*]], -1 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint float* [[TMP0:%.*]] to i32 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]] +; CHECK-NEXT: br label [[TMP11:%.*]] +; CHECK: 11: +; CHECK-NEXT: [[TMP12:%.*]] = phi i32 [ [[TMP5]], [[TMP2:%.*]] ], [ [[TMP21:%.*]], [[TMP11]] ] +; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ 
[[TMP19:%.*]], [[TMP11]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP17:%.*]], [[TMP11]] ] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <16 x i1> @llvm.arm.mve.vctp8(i32 [[TMP12]]) +; CHECK-NEXT: [[MASK:%.*]] = tail call <4 x i1> @v16i1_to_v4i1(<16 x i1> [[TMP15]]) +; CHECK-NEXT: [[TMP16:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP14]], i32 32, <4 x i1> [[MASK]]) +; CHECK-NEXT: [[TMP17]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 0 +; CHECK-NEXT: [[TMP19]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP13]], <4 x float> [[TMP18]], <4 x i1> [[MASK]], <4 x float> [[TMP13]]) +; CHECK-NEXT: [[TMP20:%.*]] = icmp sgt i32 [[TMP12]], 4 +; CHECK-NEXT: [[TMP21]] = add i32 [[TMP12]], -4 +; CHECK-NEXT: br i1 [[TMP20]], label [[TMP11]], label [[TMP22:%.*]] +; CHECK: 22: +; CHECK-NEXT: [[TMP23:%.*]] = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> [[TMP19]]) +; CHECK-NEXT: [[TMP24:%.*]] = sitofp i32 [[TMP23]] to float +; CHECK-NEXT: [[TMP25:%.*]] = tail call float @llvm.fabs.f32(float [[TMP24]]) +; CHECK-NEXT: ret float [[TMP25]] +; + %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8) + %4 = extractvalue { <4 x i32>, i32 } %3, 0 + %5 = add nsw i32 %1, -1 + %6 = ptrtoint float* %0 to i32 + %7 = insertelement <4 x i32> undef, i32 %6, i32 0 + %8 = add <4 x i32> %7, + %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer + %10 = add <4 x i32> %4, %9 + br label %11 + +11: ; preds = %11, %2 + %12 = phi i32 [ %5, %2 ], [ %20, %11 ] + %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ] + %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ] + %15 = tail call <16 x i1> @llvm.arm.mve.vctp8(i32 %12) + %mask = tail call <4 x i1> @v16i1_to_v4i1(<16 x i1> %15) + %16 = tail 
call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %mask) + %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1 + %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0 + %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %mask, <4 x float> %13) + %20 = add nsw i32 %12, -4 + %21 = icmp sgt i32 %12, 4 + br i1 %21, label %11, label %22 + +22: ; preds = %11 + %23 = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> %19) + %24 = sitofp i32 %23 to float + %25 = tail call float @llvm.fabs.f32(float %24) + ret float %25 +} + +define float @vctp16(float* %0, i32 %1) { +; CHECK-LABEL: @vctp16( +; CHECK-NEXT: [[TMP3:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP1:%.*]], -1 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint float* [[TMP0:%.*]] to i32 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]] +; CHECK-NEXT: br label [[TMP11:%.*]] +; CHECK: 11: +; CHECK-NEXT: [[TMP12:%.*]] = phi i32 [ [[TMP5]], [[TMP2:%.*]] ], [ [[TMP21:%.*]], [[TMP11]] ] +; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP19:%.*]], [[TMP11]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP17:%.*]], [[TMP11]] ] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP12]]) +; CHECK-NEXT: [[MASK:%.*]] = tail call <4 x i1> @v8i1_to_v4i1(<8 x i1> [[TMP15]]) +; CHECK-NEXT: [[TMP16:%.*]] = tail call { <4 x float>, <4 x i32> } 
@llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP14]], i32 32, <4 x i1> [[MASK]]) +; CHECK-NEXT: [[TMP17]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 0 +; CHECK-NEXT: [[TMP19]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP13]], <4 x float> [[TMP18]], <4 x i1> [[MASK]], <4 x float> [[TMP13]]) +; CHECK-NEXT: [[TMP20:%.*]] = icmp sgt i32 [[TMP12]], 4 +; CHECK-NEXT: [[TMP21]] = add i32 [[TMP12]], -4 +; CHECK-NEXT: br i1 [[TMP20]], label [[TMP11]], label [[TMP22:%.*]] +; CHECK: 22: +; CHECK-NEXT: [[TMP23:%.*]] = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> [[TMP19]]) +; CHECK-NEXT: [[TMP24:%.*]] = sitofp i32 [[TMP23]] to float +; CHECK-NEXT: [[TMP25:%.*]] = tail call float @llvm.fabs.f32(float [[TMP24]]) +; CHECK-NEXT: ret float [[TMP25]] +; + %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8) + %4 = extractvalue { <4 x i32>, i32 } %3, 0 + %5 = add nsw i32 %1, -1 + %6 = ptrtoint float* %0 to i32 + %7 = insertelement <4 x i32> undef, i32 %6, i32 0 + %8 = add <4 x i32> %7, + %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer + %10 = add <4 x i32> %4, %9 + br label %11 + +11: ; preds = %11, %2 + %12 = phi i32 [ %5, %2 ], [ %20, %11 ] + %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ] + %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ] + %15 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %12) + %mask = tail call <4 x i1> @v8i1_to_v4i1(<8 x i1> %15) + %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %mask) + %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1 + %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0 + %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %mask, <4 x 
float> %13) + %20 = add nsw i32 %12, -4 + %21 = icmp sgt i32 %12, 4 + br i1 %21, label %11, label %22 + +22: ; preds = %11 + %23 = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> %19) + %24 = sitofp i32 %23 to float + %25 = tail call float @llvm.fabs.f32(float %24) + ret float %25 +} + +define float @vctpi32(float* %0, i32 %1) { +; CHECK-LABEL: @vctpi32( +; CHECK-NEXT: [[TMP3:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP1:%.*]], -1 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint float* [[TMP0:%.*]] to i32 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]] +; CHECK-NEXT: br label [[TMP11:%.*]] +; CHECK: 11: +; CHECK-NEXT: [[TMP12:%.*]] = phi i32 [ [[TMP5]], [[TMP2:%.*]] ], [ [[TMP21:%.*]], [[TMP11]] ] +; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP19:%.*]], [[TMP11]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP17:%.*]], [[TMP11]] ] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP12]]) +; CHECK-NEXT: [[TMP16:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP14]], i32 32, <4 x i1> [[TMP15]]) +; CHECK-NEXT: [[TMP17]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 0 +; CHECK-NEXT: [[TMP19]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP13]], <4 x float> [[TMP18]], <4 x i1> [[TMP15]], <4 x float> [[TMP13]]) +; CHECK-NEXT: [[TMP20:%.*]] = icmp 
sgt i32 [[TMP12]], 4 +; CHECK-NEXT: [[TMP21]] = add i32 [[TMP12]], -4 +; CHECK-NEXT: br i1 [[TMP20]], label [[TMP11]], label [[TMP22:%.*]] +; CHECK: 22: +; CHECK-NEXT: [[TMP23:%.*]] = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> [[TMP19]]) +; CHECK-NEXT: [[TMP24:%.*]] = sitofp i32 [[TMP23]] to float +; CHECK-NEXT: [[TMP25:%.*]] = tail call float @llvm.fabs.f32(float [[TMP24]]) +; CHECK-NEXT: ret float [[TMP25]] +; + %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8) + %4 = extractvalue { <4 x i32>, i32 } %3, 0 + %5 = add nsw i32 %1, -1 + %6 = ptrtoint float* %0 to i32 + %7 = insertelement <4 x i32> undef, i32 %6, i32 0 + %8 = add <4 x i32> %7, + %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer + %10 = add <4 x i32> %4, %9 + br label %11 + +11: ; preds = %11, %2 + %12 = phi i32 [ %5, %2 ], [ %20, %11 ] + %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ] + %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ] + %15 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %12) + %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %15) + %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1 + %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0 + %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %15, <4 x float> %13) + %20 = add nsw i32 %12, -4 + %21 = icmp sgt i32 %12, 4 + br i1 %21, label %11, label %22 + +22: ; preds = %11 + %23 = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> %19) + %24 = sitofp i32 %23 to float + %25 = tail call float @llvm.fabs.f32(float %24) + ret float %25 +} + + +define float @vctpi64(float* %0, i32 %1) { +; CHECK-LABEL: @vctpi64( +; CHECK-NEXT: [[TMP3:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <4 x 
i32>, i32 } [[TMP3]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP1:%.*]], -1 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint float* [[TMP0:%.*]] to i32 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]] +; CHECK-NEXT: br label [[TMP11:%.*]] +; CHECK: 11: +; CHECK-NEXT: [[TMP12:%.*]] = phi i32 [ [[TMP5]], [[TMP2:%.*]] ], [ [[TMP21:%.*]], [[TMP11]] ] +; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP19:%.*]], [[TMP11]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP17:%.*]], [[TMP11]] ] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <4 x i1> @llvm.arm.mve.vctp64(i32 [[TMP12]]) +; CHECK-NEXT: [[TMP16:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP14]], i32 32, <4 x i1> [[TMP15]]) +; CHECK-NEXT: [[TMP17]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 0 +; CHECK-NEXT: [[TMP19]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP13]], <4 x float> [[TMP18]], <4 x i1> [[TMP15]], <4 x float> [[TMP13]]) +; CHECK-NEXT: [[TMP20:%.*]] = icmp sgt i32 [[TMP12]], 4 +; CHECK-NEXT: [[TMP21]] = add i32 [[TMP12]], -4 +; CHECK-NEXT: br i1 [[TMP20]], label [[TMP11]], label [[TMP22:%.*]] +; CHECK: 22: +; CHECK-NEXT: [[TMP23:%.*]] = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> [[TMP19]]) +; CHECK-NEXT: [[TMP24:%.*]] = sitofp i32 [[TMP23]] to float +; CHECK-NEXT: [[TMP25:%.*]] = tail call float @llvm.fabs.f32(float [[TMP24]]) +; CHECK-NEXT: ret float [[TMP25]] +; + %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8) + %4 = 
extractvalue { <4 x i32>, i32 } %3, 0 + %5 = add nsw i32 %1, -1 + %6 = ptrtoint float* %0 to i32 + %7 = insertelement <4 x i32> undef, i32 %6, i32 0 + %8 = add <4 x i32> %7, + %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer + %10 = add <4 x i32> %4, %9 + br label %11 + +11: ; preds = %11, %2 + %12 = phi i32 [ %5, %2 ], [ %20, %11 ] + %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ] + %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ] + %15 = tail call <4 x i1> @llvm.arm.mve.vctp64(i32 %12) + %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %15) + %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1 + %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0 + %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %15, <4 x float> %13) + %20 = add nsw i32 %12, -4 + %21 = icmp sgt i32 %12, 4 + br i1 %21, label %11, label %22 + +22: ; preds = %11 + %23 = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> %19) + %24 = sitofp i32 %23 to float + %25 = tail call float @llvm.fabs.f32(float %24) + ret float %25 +} + +declare { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32, i32) +declare <16 x i1> @llvm.arm.mve.vctp8(i32) +declare <8 x i1> @llvm.arm.mve.vctp16(i32) +declare <4 x i1> @llvm.arm.mve.vctp32(i32) +declare <4 x i1> @llvm.arm.mve.vctp64(i32) +declare { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>) +declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) +declare i32 @vecAddAcrossF32Mve(...) +declare <4 x i1> @v8i1_to_v4i1(<8 x i1>) +declare <4 x i1> @v16i1_to_v4i1(<16 x i1>) +declare float @llvm.fabs.f32(float)