From: Bradley Smith
Date: Mon, 26 Apr 2021 15:19:25 +0000 (+0100)
Subject: [AArch64][SVE] Move convert.{from,to}.svbool optimization into InstCombine
X-Git-Tag: llvmorg-14-init~8093
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=c8f20ed44888f3a09c077690480d1d978c881b0d;p=platform%2Fupstream%2Fllvm.git

[AArch64][SVE] Move convert.{from,to}.svbool optimization into InstCombine

As part of this the ptrue coalescing done in SVEIntrinsicOpts has been
modified to not introduce redundant converts, since the convert removal
will no longer run after that optimisation to clean up.

Differential Revision: https://reviews.llvm.org/D101302
---

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 73d96d9..94bc2ee 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -281,6 +281,91 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
 }
 
+/// The function will remove redundant reinterprets casting in the presence
+/// of the control flow
+static Optional<Instruction *> processPhiNode(InstCombiner &IC,
+                                              IntrinsicInst &II) {
+  SmallVector<Instruction *, 32> Worklist;
+  auto RequiredType = II.getType();
+
+  auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
+  assert(PN && "Expected Phi Node!");
+
+  // Don't create a new Phi unless we can remove the old one.
+  if (!PN->hasOneUse())
+    return None;
+
+  for (Value *IncValPhi : PN->incoming_values()) {
+    auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
+    if (!Reinterpret ||
+        Reinterpret->getIntrinsicID() !=
+            Intrinsic::aarch64_sve_convert_to_svbool ||
+        RequiredType != Reinterpret->getArgOperand(0)->getType())
+      return None;
+  }
+
+  // Create the new Phi
+  LLVMContext &Ctx = PN->getContext();
+  IRBuilder<> Builder(Ctx);
+  Builder.SetInsertPoint(PN);
+  PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
+  Worklist.push_back(PN);
+
+  for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
+    auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
+    NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
+    Worklist.push_back(Reinterpret);
+  }
+
+  // Cleanup Phi Node and reinterprets
+  return IC.replaceInstUsesWith(II, NPN);
+}
+
+static Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC,
+                                                            IntrinsicInst &II) {
+  // If the reinterpret instruction operand is a PHI Node
+  if (isa<PHINode>(II.getArgOperand(0)))
+    return processPhiNode(IC, II);
+
+  SmallVector<Instruction *, 32> CandidatesForRemoval;
+  Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
+
+  const auto *IVTy = cast<VectorType>(II.getType());
+
+  // Walk the chain of conversions.
+  while (Cursor) {
+    // If the type of the cursor has fewer lanes than the final result, zeroing
+    // must take place, which breaks the equivalence chain.
+    const auto *CursorVTy = cast<VectorType>(Cursor->getType());
+    if (CursorVTy->getElementCount().getKnownMinValue() <
+        IVTy->getElementCount().getKnownMinValue())
+      break;
+
+    // If the cursor has the same type as I, it is a viable replacement.
+    if (Cursor->getType() == IVTy)
+      EarliestReplacement = Cursor;
+
+    auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
+
+    // If this is not an SVE conversion intrinsic, this is the end of the chain.
+    if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
+                                  Intrinsic::aarch64_sve_convert_to_svbool ||
+                              IntrinsicCursor->getIntrinsicID() ==
+                                  Intrinsic::aarch64_sve_convert_from_svbool))
+      break;
+
+    CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
+    Cursor = IntrinsicCursor->getOperand(0);
+  }
+
+  // If no viable replacement in the conversion chain was found, there is
+  // nothing to do.
+  if (!EarliestReplacement)
+    return None;
+
+  return IC.replaceInstUsesWith(II, EarliestReplacement);
+}
+
 static Optional<Instruction *> instCombineSVELast(InstCombiner &IC,
                                                   IntrinsicInst &II) {
   Value *Pg = II.getArgOperand(0);
@@ -368,6 +453,8 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
   switch (IID) {
   default:
     break;
+  case Intrinsic::aarch64_sve_convert_from_svbool:
+    return instCombineConvertFromSVBool(IC, II);
   case Intrinsic::aarch64_sve_lasta:
   case Intrinsic::aarch64_sve_lastb:
     return instCombineSVELast(IC, II);
diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
index e907400..2a11556 100644
--- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
+++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
@@ -11,18 +11,13 @@
 //
 // This pass performs the following optimizations:
 //
-// - removes unnecessary reinterpret intrinsics
-//   (llvm.aarch64.sve.convert.[to|from].svbool), e.g:
-//     %1 = @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
-//     %2 = @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
-//
 // - removes unnecessary ptrue intrinsics (llvm.aarch64.sve.ptrue), e.g:
 //     %1 = @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
 //     %2 = @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
 //     ; (%1 can be replaced with a reinterpret of %2)
 //
-// - optimizes ptest intrinsics and phi instructions where the operands are
-//   being needlessly converted to and from svbool_t.
+// - optimizes ptest intrinsics where the operands are being needlessly
+//   converted to and from svbool_t.
 //
 //===----------------------------------------------------------------------===//
 
@@ -75,12 +70,9 @@ private:
   /// the functions themselves.
   bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions);
 
-  static bool optimizeConvertFromSVBool(IntrinsicInst *I);
   static bool optimizePTest(IntrinsicInst *I);
   static bool optimizeVectorMul(IntrinsicInst *I);
   static bool optimizeTBL(IntrinsicInst *I);
-
-  static bool processPhiNode(IntrinsicInst *I);
 };
 } // end anonymous namespace
 
@@ -197,17 +189,30 @@ bool SVEIntrinsicOpts::coalescePTrueIntrinsicCalls(
       Intrinsic::aarch64_sve_convert_to_svbool, {MostEncompassingPTrueVTy},
       {MostEncompassingPTrue});
 
+  bool ConvertFromCreated = false;
   for (auto *PTrue : PTrues) {
     auto *PTrueVTy = cast<VectorType>(PTrue->getType());
 
-    Builder.SetInsertPoint(&BB, ++ConvertToSVBool->getIterator());
-    auto *ConvertFromSVBool =
-        Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
-                                {PTrueVTy}, {ConvertToSVBool});
-    PTrue->replaceAllUsesWith(ConvertFromSVBool);
+    // Only create the converts if the types are not already the same, otherwise
+    // just use the most encompassing ptrue.
+    if (MostEncompassingPTrueVTy != PTrueVTy) {
+      ConvertFromCreated = true;
+
+      Builder.SetInsertPoint(&BB, ++ConvertToSVBool->getIterator());
+      auto *ConvertFromSVBool =
+          Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
+                                  {PTrueVTy}, {ConvertToSVBool});
+      PTrue->replaceAllUsesWith(ConvertFromSVBool);
+    } else
+      PTrue->replaceAllUsesWith(MostEncompassingPTrue);
+
     PTrue->eraseFromParent();
   }
 
+  // We never used the ConvertTo so remove it
+  if (!ConvertFromCreated)
+    ConvertToSVBool->eraseFromParent();
+
   return true;
 }
 
@@ -294,51 +299,6 @@ bool SVEIntrinsicOpts::optimizePTrueIntrinsicCalls(
   return Changed;
 }
 
-/// The function will remove redundant reinterprets casting in the presence
-/// of the control flow
-bool SVEIntrinsicOpts::processPhiNode(IntrinsicInst *X) {
-
-  SmallVector<Instruction *, 32> Worklist;
-  auto RequiredType = X->getType();
-
-  auto *PN = dyn_cast<PHINode>(X->getArgOperand(0));
-  assert(PN && "Expected Phi Node!");
-
-  // Don't create a new Phi unless we can remove the old one.
-  if (!PN->hasOneUse())
-    return false;
-
-  for (Value *IncValPhi : PN->incoming_values()) {
-    auto *Reinterpret = isReinterpretToSVBool(IncValPhi);
-    if (!Reinterpret ||
-        RequiredType != Reinterpret->getArgOperand(0)->getType())
-      return false;
-  }
-
-  // Create the new Phi
-  LLVMContext &Ctx = PN->getContext();
-  IRBuilder<> Builder(Ctx);
-  Builder.SetInsertPoint(PN);
-  PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
-  Worklist.push_back(PN);
-
-  for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
-    auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
-    NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
-    Worklist.push_back(Reinterpret);
-  }
-
-  // Cleanup Phi Node and reinterprets
-  X->replaceAllUsesWith(NPN);
-  X->eraseFromParent();
-
-  for (auto &I : Worklist)
-    if (I->use_empty())
-      I->eraseFromParent();
-
-  return true;
-}
-
 bool SVEIntrinsicOpts::optimizePTest(IntrinsicInst *I) {
   IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(I->getArgOperand(0));
   IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(I->getArgOperand(1));
@@ -473,69 +433,12 @@ bool SVEIntrinsicOpts::optimizeTBL(IntrinsicInst *I) {
   return true;
 }
 
-bool SVEIntrinsicOpts::optimizeConvertFromSVBool(IntrinsicInst *I) {
-  assert(I->getIntrinsicID() == Intrinsic::aarch64_sve_convert_from_svbool &&
-         "Unexpected opcode");
-
-  // If the reinterpret instruction operand is a PHI Node
-  if (isa<PHINode>(I->getArgOperand(0)))
-    return processPhiNode(I);
-
-  SmallVector<Instruction *, 32> CandidatesForRemoval;
-  Value *Cursor = I->getOperand(0), *EarliestReplacement = nullptr;
-
-  const auto *IVTy = cast<VectorType>(I->getType());
-
-  // Walk the chain of conversions.
-  while (Cursor) {
-    // If the type of the cursor has fewer lanes than the final result, zeroing
-    // must take place, which breaks the equivalence chain.
-    const auto *CursorVTy = cast<VectorType>(Cursor->getType());
-    if (CursorVTy->getElementCount().getKnownMinValue() <
-        IVTy->getElementCount().getKnownMinValue())
-      break;
-
-    // If the cursor has the same type as I, it is a viable replacement.
-    if (Cursor->getType() == IVTy)
-      EarliestReplacement = Cursor;
-
-    auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
-
-    // If this is not an SVE conversion intrinsic, this is the end of the chain.
-    if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
-                                  Intrinsic::aarch64_sve_convert_to_svbool ||
-                              IntrinsicCursor->getIntrinsicID() ==
-                                  Intrinsic::aarch64_sve_convert_from_svbool))
-      break;
-
-    CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
-    Cursor = IntrinsicCursor->getOperand(0);
-  }
-
-  // If no viable replacement in the conversion chain was found, there is
-  // nothing to do.
-  if (!EarliestReplacement)
-    return false;
-
-  I->replaceAllUsesWith(EarliestReplacement);
-  I->eraseFromParent();
-
-  while (!CandidatesForRemoval.empty()) {
-    Instruction *Candidate = CandidatesForRemoval.pop_back_val();
-    if (Candidate->use_empty())
-      Candidate->eraseFromParent();
-  }
-  return true;
-}
-
 bool SVEIntrinsicOpts::optimizeIntrinsic(Instruction *I) {
   IntrinsicInst *IntrI = dyn_cast<IntrinsicInst>(I);
   if (!IntrI)
     return false;
 
   switch (IntrI->getIntrinsicID()) {
-  case Intrinsic::aarch64_sve_convert_from_svbool:
-    return optimizeConvertFromSVBool(IntrI);
   case Intrinsic::aarch64_sve_fmul:
   case Intrinsic::aarch64_sve_mul:
     return optimizeVectorMul(IntrI);
@@ -591,7 +494,6 @@ bool SVEIntrinsicOpts::runOnModule(Module &M) {
       continue;
 
     switch (F.getIntrinsicID()) {
-    case Intrinsic::aarch64_sve_convert_from_svbool:
    case Intrinsic::aarch64_sve_ptest_any:
    case Intrinsic::aarch64_sve_ptest_first:
    case Intrinsic::aarch64_sve_ptest_last:
diff --git a/llvm/test/CodeGen/AArch64/sve-coalesce-ptrue-intrinsics.ll b/llvm/test/CodeGen/AArch64/sve-coalesce-ptrue-intrinsics.ll
index 23a956d..bab682c 100644
--- a/llvm/test/CodeGen/AArch64/sve-coalesce-ptrue-intrinsics.ll
+++ b/llvm/test/CodeGen/AArch64/sve-coalesce-ptrue-intrinsics.ll
@@ -163,14 +163,13 @@ define <vscale x 4 x i32> @coalesce_test_same_size(i32* %addr) {
 define <vscale x 8 x i16> @coalesce_test_promoted_ptrue(i32* %addr1, i16* %addr2) {
 ; CHECK-LABEL: @coalesce_test_promoted_ptrue(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> [[TMP3]])
-; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP4]])
-; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP3]], i32* [[ADDR1:%.*]])
-; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1> [[TMP5]], i16* [[ADDR2:%.*]])
-; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1> [[TMP1]], i16* [[ADDR2]])
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP8]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP2]], i32* [[ADDR1:%.*]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1> [[TMP4]], i16* [[ADDR2:%.*]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1> [[TMP1]], i16* [[ADDR2]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP7]]
 ;
   %1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-reinterpret.ll
similarity index 71%
rename from llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll
rename to llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-reinterpret.ll
index 792556b..0b58da6 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-reinterpret.ll
@@ -1,9 +1,11 @@
-; RUN: opt -S -aarch64-sve-intrinsic-opts -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck --check-prefix OPT %s
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+target triple = "aarch64"
 
 define <vscale x 8 x i1> @reinterpret_test_h(<vscale x 8 x i1> %a) {
-; OPT-LABEL: @reinterpret_test_h(
-; OPT-NOT: convert
-; OPT: ret <vscale x 8 x i1> %a
+; CHECK-LABEL: @reinterpret_test_h(
+; CHECK-NOT: convert
+; CHECK: ret <vscale x 8 x i1> %a
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
   %2 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %1)
   ret <vscale x 8 x i1> %2
@@ -12,19 +14,19 @@ define <vscale x 8 x i1> @reinterpret_test_h(<vscale x 8 x i1> %a) {
 ; Reinterprets are not redundant because the second reinterpret zeros the
 ; lanes that don't exist within its input.
 define <vscale x 16 x i1> @reinterpret_test_h_rev(<vscale x 16 x i1> %a) {
-; OPT-LABEL: @reinterpret_test_h_rev(
-; OPT: %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %a)
-; OPT-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %1)
-; OPT-NEXT: ret <vscale x 16 x i1> %2
+; CHECK-LABEL: @reinterpret_test_h_rev(
+; CHECK: %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %a)
+; CHECK-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %1)
+; CHECK-NEXT: ret <vscale x 16 x i1> %2
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %a)
   %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %1)
   ret <vscale x 16 x i1> %2
 }
 
 define <vscale x 4 x i1> @reinterpret_test_w(<vscale x 4 x i1> %a) {
-; OPT-LABEL: @reinterpret_test_w(
-; OPT-NOT: convert
-; OPT: ret <vscale x 4 x i1> %a
+; CHECK-LABEL: @reinterpret_test_w(
+; CHECK-NOT: convert
+; CHECK: ret <vscale x 4 x i1> %a
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
   %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
   ret <vscale x 4 x i1> %2
@@ -33,19 +35,19 @@ define <vscale x 4 x i1> @reinterpret_test_w(<vscale x 4 x i1> %a) {
 ; Reinterprets are not redundant because the second reinterpret zeros the
 ; lanes that don't exist within its input.
 define <vscale x 16 x i1> @reinterpret_test_w_rev(<vscale x 16 x i1> %a) {
-; OPT-LABEL: @reinterpret_test_w_rev(
-; OPT: %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %a)
-; OPT-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
-; OPT-NEXT: ret <vscale x 16 x i1> %2
+; CHECK-LABEL: @reinterpret_test_w_rev(
+; CHECK: %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %a)
+; CHECK-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
+; CHECK-NEXT: ret <vscale x 16 x i1> %2
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %a)
   %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
   ret <vscale x 16 x i1> %2
 }
 
 define <vscale x 2 x i1> @reinterpret_test_d(<vscale x 2 x i1> %a) {
-; OPT-LABEL: @reinterpret_test_d(
-; OPT-NOT: convert
-; OPT: ret <vscale x 2 x i1> %a
+; CHECK-LABEL: @reinterpret_test_d(
+; CHECK-NOT: convert
+; CHECK: ret <vscale x 2 x i1> %a
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
   %2 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %1)
   ret <vscale x 2 x i1> %2
@@ -54,18 +56,18 @@ define <vscale x 2 x i1> @reinterpret_test_d(<vscale x 2 x i1> %a) {
 ; Reinterprets are not redundant because the second reinterpret zeros the
 ; lanes that don't exist within its input.
 define <vscale x 16 x i1> @reinterpret_test_d_rev(<vscale x 16 x i1> %a) {
-; OPT-LABEL: @reinterpret_test_d_rev(
-; OPT: %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %a)
-; OPT-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %1)
-; OPT-NEXT: ret <vscale x 16 x i1> %2
+; CHECK-LABEL: @reinterpret_test_d_rev(
+; CHECK: %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %a)
+; CHECK-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %1)
+; CHECK-NEXT: ret <vscale x 16 x i1> %2
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %a)
   %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %1)
   ret <vscale x 16 x i1> %2
 }
 
 define <vscale x 2 x i1> @reinterpret_test_full_chain(<vscale x 2 x i1> %a) {
-; OPT-LABEL: @reinterpret_test_full_chain(
-; OPT: ret <vscale x 2 x i1> %a
+; CHECK-LABEL: @reinterpret_test_full_chain(
+; CHECK: ret <vscale x 2 x i1> %a
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
   %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
   %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %2)
@@ -78,10 +80,10 @@ define <vscale x 2 x i1> @reinterpret_test_full_chain(<vscale x 2 x i1> %a) {
 ; The last two reinterprets are not necessary, since they are doing the same
 ; work as the first two.
 define <vscale x 4 x i1> @reinterpret_test_partial_chain(<vscale x 2 x i1> %a) {
-; OPT-LABEL: @reinterpret_test_partial_chain(
-; OPT: %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
-; OPT-NEXT: %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
-; OPT-NEXT: ret <vscale x 4 x i1> %2
+; CHECK-LABEL: @reinterpret_test_partial_chain(
+; CHECK: %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
+; CHECK-NEXT: %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
+; CHECK-NEXT: ret <vscale x 4 x i1> %2
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
   %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
   %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %2)
@@ -92,12 +94,12 @@ define <vscale x 4 x i1> @reinterpret_test_partial_chain(<vscale x 2 x i1> %a) {
 ; The chain cannot be reduced because of the second reinterpret, which causes
 ; zeroing.
 define <vscale x 8 x i1> @reinterpret_test_irreducible_chain(<vscale x 8 x i1> %a) {
-; OPT-LABEL: @reinterpret_test_irreducible_chain(
-; OPT: %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
-; OPT-NEXT: %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
-; OPT-NEXT: %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %2)
-; OPT-NEXT: %4 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %3)
-; OPT-NEXT: ret <vscale x 8 x i1> %4
+; CHECK-LABEL: @reinterpret_test_irreducible_chain(
+; CHECK: %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
+; CHECK-NEXT: %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
+; CHECK-NEXT: %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %2)
+; CHECK-NEXT: %4 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %3)
+; CHECK-NEXT: ret <vscale x 8 x i1> %4
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
   %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
   %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %2)
@@ -108,10 +110,10 @@ define <vscale x 8 x i1> @reinterpret_test_irreducible_chain(<vscale x 8 x i1> %a) {
 ; Here, the candidate list is larger than the number of instructions that we
 ; end up removing.
 define <vscale x 4 x i1> @reinterpret_test_keep_some_candidates(<vscale x 8 x i1> %a) {
-; OPT-LABEL: @reinterpret_test_keep_some_candidates(
-; OPT: %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
-; OPT-NEXT: %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
-; OPT-NEXT: ret <vscale x 4 x i1> %2
+; CHECK-LABEL: @reinterpret_test_keep_some_candidates(
+; CHECK: %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
+; CHECK-NEXT: %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
+; CHECK-NEXT: ret <vscale x 4 x i1> %2
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
   %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
   %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %2)
@@ -120,12 +122,12 @@ define <vscale x 4 x i1> @reinterpret_test_keep_some_candidates(
 define @reinterpret_reductions(i32 %cond, %a, %b, %c) {
-; OPT-LABEL: reinterpret_reductions
-; OPT-NOT: convert
-; OPT-NOT: phi
-; OPT: phi [ %a, %br_phi_a ], [ %b, %br_phi_b ], [ %c, %br_phi_c ]
-; OPT-NOT: convert
-; OPT: ret
+; CHECK-LABEL: reinterpret_reductions
+; CHECK-NOT: convert
+; CHECK-NOT: phi
+; CHECK: phi [ %a, %br_phi_a ], [ %b, %br_phi_b ], [ %c, %br_phi_c ]
+; CHECK-NOT: convert
+; CHECK: ret
 entry:
   switch i32 %cond, label %br_phi_c [
@@ -154,12 +156,12 @@ join:
 ; No transform as the reinterprets are converting from different types (nxv2i1 & nxv4i1)
 ; As the incoming values to the phi must all be the same type, we cannot remove the reinterprets.
 define @reinterpret_reductions_1(i32 %cond, %a, %b, %c) {
-; OPT-LABEL: reinterpret_reductions_1
-; OPT: convert
-; OPT: phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
-; OPT-NOT: phi
-; OPT: tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
-; OPT: ret
+; CHECK-LABEL: reinterpret_reductions_1
+; CHECK: convert
+; CHECK: phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
+; CHECK-NOT: phi
+; CHECK: tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
+; CHECK: ret
 entry:
   switch i32 %cond, label %br_phi_c [
@@ -188,12 +190,12 @@ join:
 ; No transform. Similar to the test above, but here only two of the arguments need to
 ; be converted to svbool.
 define @reinterpret_reductions_2(i32 %cond, %a, %b, %c) {
-; OPT-LABEL: reinterpret_reductions_2
-; OPT: convert
-; OPT: phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b, %br_phi_b ], [ %c1, %br_phi_c ]
-; OPT-NOT: phi
-; OPT: tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
-; OPT: ret
+; CHECK-LABEL: reinterpret_reductions_2
+; CHECK: convert
+; CHECK: phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b, %br_phi_b ], [ %c1, %br_phi_c ]
+; CHECK-NOT: phi
+; CHECK: tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
+; CHECK: ret
 entry:
   switch i32 %cond, label %br_phi_c [
@@ -221,11 +223,10 @@ join:
 ; Similar to reinterpret_reductions but the reinterprets remain because the
 ; original phi cannot be removed (i.e. prefer reinterprets over multiple phis).
 define @reinterpret_reductions3(i32 %cond, %a, %b, %c) {
-; OPT-LABEL: reinterpret_reductions3
-; OPT: phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
-; OPT-NOT: phi
-; OPT: tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
-; OPT-NEXT: ret <vscale x 16 x i1> %pg
+; CHECK-LABEL: reinterpret_reductions3
+; CHECK: phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
+; CHECK-NOT: phi
+; CHECK: ret <vscale x 16 x i1> %pg
 entry:
   switch i32 %cond, label %br_phi_c [
@@ -257,3 +258,5 @@ declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)
 declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)
 declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)
 declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>)
+
+attributes #0 = { "target-features"="+sve" }
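
For readers who want to try the moved fold in isolation, below is a minimal standalone IR sketch (assuming an opt binary built with the AArch64 backend; the file name roundtrip.ll and the function name @roundtrip are illustrative and not part of the patch). It mirrors reinterpret_test_h above: converting a predicate to the generic svbool type and straight back at the same element count is a lossless round trip, so after this change running `opt -S -instcombine` replaces the final convert with %a and leaves both intrinsic calls dead.

; roundtrip.ll -- illustrative reproduction of the redundant reinterpret pattern
; Run with: opt -S -instcombine roundtrip.ll
target triple = "aarch64"

define <vscale x 8 x i1> @roundtrip(<vscale x 8 x i1> %a) {
  ; Widening %a to svbool and narrowing it back keeps every lane intact,
  ; so InstCombine folds the pair and returns %a directly.
  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
  %2 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %1)
  ret <vscale x 8 x i1> %2
}

declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)

Note that the reverse order (from.svbool then to.svbool, as in reinterpret_test_h_rev) is deliberately not folded, because the narrowing convert zeroes the lanes beyond its element count.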