[AArch64][SVE] Move convert.{from,to}.svbool optimization into InstCombine

author Bradley Smith <bradley.smith@arm.com>

Mon, 26 Apr 2021 15:19:25 +0000 (16:19 +0100)

committer Bradley Smith <bradley.smith@arm.com>

Thu, 29 Apr 2021 11:17:42 +0000 (12:17 +0100)
author Bradley Smith <bradley.smith@arm.com>
Mon, 26 Apr 2021 15:19:25 +0000 (16:19 +0100)
committer Bradley Smith <bradley.smith@arm.com>
Thu, 29 Apr 2021 11:17:42 +0000 (12:17 +0100)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

index 73d96d9..94bc2ee 100644 (file)
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -281,6 +281,91 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
    return BaseT::getIntrinsicInstrCost(ICA, CostKind);
  }
  
+/// Fold a convert.from.svbool whose operand is a PHI of convert.to.svbool
+/// calls into a PHI over the unconverted values, removing redundant
+/// reinterpret casts in the presence of control flow.
+/// \p II is the convert.from.svbool call; its operand must be a PHINode.
+/// Returns None when the fold does not apply, otherwise the result of
+/// replacing the uses of \p II with the new PHI.
+static Optional<Instruction *> processPhiNode(InstCombiner &IC,
+                                              IntrinsicInst &II) {
+  auto RequiredType = II.getType();
+
+  // The caller only dispatches here when the operand is a PHI.
+  auto *PN = cast<PHINode>(II.getArgOperand(0));
+
+  // Don't create a new Phi unless we can remove the old one.
+  if (!PN->hasOneUse())
+    return None;
+
+  // Every incoming value must be a convert.to.svbool from the required type.
+  for (Value *IncValPhi : PN->incoming_values()) {
+    auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
+    if (!Reinterpret ||
+        Reinterpret->getIntrinsicID() !=
+            Intrinsic::aarch64_sve_convert_to_svbool ||
+        RequiredType != Reinterpret->getArgOperand(0)->getType())
+      return None;
+  }
+
+  // Create the new Phi immediately before the old one.
+  IRBuilder<> Builder(PN);
+  PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
+
+  for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
+    auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
+    NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
+  }
+
+  // Replace II; the now-dead Phi and reinterprets are left to InstCombine.
+  return IC.replaceInstUsesWith(II, NPN);
+}
+
+/// Replace a convert.from.svbool with the earliest value of the same type in
+/// its chain of convert.{to,from}.svbool operands, stopping at any narrower
+/// (zeroing) conversion. PHI operands are handled by processPhiNode.
+static Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC,
+                                                            IntrinsicInst &II) {
+  // If the reinterpret instruction operand is a PHI Node
+  if (isa<PHINode>(II.getArgOperand(0)))
+    return processPhiNode(IC, II);
+
+  Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
+
+  const auto *IVTy = cast<VectorType>(II.getType());
+
+  // Walk the chain of conversions.
+  while (Cursor) {
+    // If the type of the cursor has fewer lanes than the final result, zeroing
+    // must take place, which breaks the equivalence chain.
+    const auto *CursorVTy = cast<VectorType>(Cursor->getType());
+    if (CursorVTy->getElementCount().getKnownMinValue() <
+        IVTy->getElementCount().getKnownMinValue())
+      break;
+
+    // If the cursor has the same type as II, it is a viable replacement.
+    if (Cursor->getType() == IVTy)
+      EarliestReplacement = Cursor;
+
+    auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
+
+    // If this is not an SVE conversion intrinsic, this is the end of the chain.
+    if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
+                                  Intrinsic::aarch64_sve_convert_to_svbool ||
+                              IntrinsicCursor->getIntrinsicID() ==
+                                  Intrinsic::aarch64_sve_convert_from_svbool))
+      break;
+
+    Cursor = IntrinsicCursor->getOperand(0);
+  }
+
+  // Without a viable replacement in the chain there is nothing to do.
+  if (!EarliestReplacement)
+    return None;
+
+  return IC.replaceInstUsesWith(II, EarliestReplacement);
+}
+
  static Optional<Instruction *> instCombineSVELast(InstCombiner &IC,
                                                    IntrinsicInst &II) {
    Value *Pg = II.getArgOperand(0);
@@ -368,6 +453,8 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
    switch (IID) {
    default:
      break;
+  case Intrinsic::aarch64_sve_convert_from_svbool:
+    return instCombineConvertFromSVBool(IC, II);
    case Intrinsic::aarch64_sve_lasta:
    case Intrinsic::aarch64_sve_lastb:
      return instCombineSVELast(IC, II);
diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp

index e907400..2a11556 100644 (file)
--- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
+++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
@@ -11,18 +11,13 @@
  //
  // This pass performs the following optimizations:
  //
-// - removes unnecessary reinterpret intrinsics
-//   (llvm.aarch64.sve.convert.[to|from].svbool), e.g:
-//     %1 = @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
-//     %2 = @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
-//
  // - removes unnecessary ptrue intrinsics (llvm.aarch64.sve.ptrue), e.g:
  //     %1 = @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
  //     %2 = @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
  //     ; (%1 can be replaced with a reinterpret of %2)
  //
-// - optimizes ptest intrinsics and phi instructions where the operands are
-//   being needlessly converted to and from svbool_t.
+// - optimizes ptest intrinsics where the operands are being needlessly
+//   converted to and from svbool_t.
  //
  //===----------------------------------------------------------------------===//
  
@@ -75,12 +70,9 @@ private:
    /// the functions themselves.
    bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions);
  
-  static bool optimizeConvertFromSVBool(IntrinsicInst *I);
    static bool optimizePTest(IntrinsicInst *I);
    static bool optimizeVectorMul(IntrinsicInst *I);
    static bool optimizeTBL(IntrinsicInst *I);
-
-  static bool processPhiNode(IntrinsicInst *I);
  };
  } // end anonymous namespace
  
@@ -197,17 +189,30 @@ bool SVEIntrinsicOpts::coalescePTrueIntrinsicCalls(
        Intrinsic::aarch64_sve_convert_to_svbool, {MostEncompassingPTrueVTy},
        {MostEncompassingPTrue});
  
+  bool ConvertFromCreated = false;
    for (auto *PTrue : PTrues) {
      auto *PTrueVTy = cast<VectorType>(PTrue->getType());
  
-    Builder.SetInsertPoint(&BB, ++ConvertToSVBool->getIterator());
-    auto *ConvertFromSVBool =
-        Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
-                                {PTrueVTy}, {ConvertToSVBool});
-    PTrue->replaceAllUsesWith(ConvertFromSVBool);
+    // Only create the converts if the types are not already the same, otherwise
+    // just use the most encompassing ptrue.
+    if (MostEncompassingPTrueVTy != PTrueVTy) {
+      ConvertFromCreated = true;
+
+      Builder.SetInsertPoint(&BB, ++ConvertToSVBool->getIterator());
+      auto *ConvertFromSVBool =
+          Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
+                                  {PTrueVTy}, {ConvertToSVBool});
+      PTrue->replaceAllUsesWith(ConvertFromSVBool);
+    } else
+      PTrue->replaceAllUsesWith(MostEncompassingPTrue);
+
      PTrue->eraseFromParent();
    }
  
+  // No convert.from.svbool was created, so ConvertToSVBool is unused; remove it.
+  if (!ConvertFromCreated)
+    ConvertToSVBool->eraseFromParent();
+
    return true;
  }
  
@@ -294,51 +299,6 @@ bool SVEIntrinsicOpts::optimizePTrueIntrinsicCalls(
    return Changed;
  }
  
-/// The function will remove redundant reinterprets casting in the presence
-/// of the control flow
-bool SVEIntrinsicOpts::processPhiNode(IntrinsicInst *X) {
-
-  SmallVector<Instruction *, 32> Worklist;
-  auto RequiredType = X->getType();
-
-  auto *PN = dyn_cast<PHINode>(X->getArgOperand(0));
-  assert(PN && "Expected Phi Node!");
-
-  // Don't create a new Phi unless we can remove the old one.
-  if (!PN->hasOneUse())
-    return false;
-
-  for (Value *IncValPhi : PN->incoming_values()) {
-    auto *Reinterpret = isReinterpretToSVBool(IncValPhi);
-    if (!Reinterpret ||
-        RequiredType != Reinterpret->getArgOperand(0)->getType())
-      return false;
-  }
-
-  // Create the new Phi
-  LLVMContext &Ctx = PN->getContext();
-  IRBuilder<> Builder(Ctx);
-  Builder.SetInsertPoint(PN);
-  PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
-  Worklist.push_back(PN);
-
-  for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
-    auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
-    NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
-    Worklist.push_back(Reinterpret);
-  }
-
-  // Cleanup Phi Node and reinterprets
-  X->replaceAllUsesWith(NPN);
-  X->eraseFromParent();
-
-  for (auto &I : Worklist)
-    if (I->use_empty())
-      I->eraseFromParent();
-
-  return true;
-}
-
  bool SVEIntrinsicOpts::optimizePTest(IntrinsicInst *I) {
    IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(I->getArgOperand(0));
    IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(I->getArgOperand(1));
@@ -473,69 +433,12 @@ bool SVEIntrinsicOpts::optimizeTBL(IntrinsicInst *I) {
    return true;
  }
  
-bool SVEIntrinsicOpts::optimizeConvertFromSVBool(IntrinsicInst *I) {
-  assert(I->getIntrinsicID() == Intrinsic::aarch64_sve_convert_from_svbool &&
-         "Unexpected opcode");
-
-  // If the reinterpret instruction operand is a PHI Node
-  if (isa<PHINode>(I->getArgOperand(0)))
-    return processPhiNode(I);
-
-  SmallVector<Instruction *, 32> CandidatesForRemoval;
-  Value *Cursor = I->getOperand(0), *EarliestReplacement = nullptr;
-
-  const auto *IVTy = cast<VectorType>(I->getType());
-
-  // Walk the chain of conversions.
-  while (Cursor) {
-    // If the type of the cursor has fewer lanes than the final result, zeroing
-    // must take place, which breaks the equivalence chain.
-    const auto *CursorVTy = cast<VectorType>(Cursor->getType());
-    if (CursorVTy->getElementCount().getKnownMinValue() <
-        IVTy->getElementCount().getKnownMinValue())
-      break;
-
-    // If the cursor has the same type as I, it is a viable replacement.
-    if (Cursor->getType() == IVTy)
-      EarliestReplacement = Cursor;
-
-    auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
-
-    // If this is not an SVE conversion intrinsic, this is the end of the chain.
-    if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
-                                  Intrinsic::aarch64_sve_convert_to_svbool ||
-                              IntrinsicCursor->getIntrinsicID() ==
-                                  Intrinsic::aarch64_sve_convert_from_svbool))
-      break;
-
-    CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
-    Cursor = IntrinsicCursor->getOperand(0);
-  }
-
-  // If no viable replacement in the conversion chain was found, there is
-  // nothing to do.
-  if (!EarliestReplacement)
-    return false;
-
-  I->replaceAllUsesWith(EarliestReplacement);
-  I->eraseFromParent();
-
-  while (!CandidatesForRemoval.empty()) {
-    Instruction *Candidate = CandidatesForRemoval.pop_back_val();
-    if (Candidate->use_empty())
-      Candidate->eraseFromParent();
-  }
-  return true;
-}
-
  bool SVEIntrinsicOpts::optimizeIntrinsic(Instruction *I) {
    IntrinsicInst *IntrI = dyn_cast<IntrinsicInst>(I);
    if (!IntrI)
      return false;
  
    switch (IntrI->getIntrinsicID()) {
-  case Intrinsic::aarch64_sve_convert_from_svbool:
-    return optimizeConvertFromSVBool(IntrI);
    case Intrinsic::aarch64_sve_fmul:
    case Intrinsic::aarch64_sve_mul:
      return optimizeVectorMul(IntrI);
@@ -591,7 +494,6 @@ bool SVEIntrinsicOpts::runOnModule(Module &M) {
        continue;
  
      switch (F.getIntrinsicID()) {
-    case Intrinsic::aarch64_sve_convert_from_svbool:
      case Intrinsic::aarch64_sve_ptest_any:
      case Intrinsic::aarch64_sve_ptest_first:
      case Intrinsic::aarch64_sve_ptest_last:
diff --git a/llvm/test/CodeGen/AArch64/sve-coalesce-ptrue-intrinsics.ll b/llvm/test/CodeGen/AArch64/sve-coalesce-ptrue-intrinsics.ll

index 23a956d..bab682c 100644 (file)
--- a/llvm/test/CodeGen/AArch64/sve-coalesce-ptrue-intrinsics.ll
+++ b/llvm/test/CodeGen/AArch64/sve-coalesce-ptrue-intrinsics.ll
@@ -163,14 +163,13 @@ define <vscale x 4 x i32> @coalesce_test_same_size(i32* %addr) {
  define <vscale x 8 x i16> @coalesce_test_promoted_ptrue(i32* %addr1, i16* %addr2) {
  ; CHECK-LABEL: @coalesce_test_promoted_ptrue(
  ; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> [[TMP3]])
-; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP4]])
-; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP3]], i32* [[ADDR1:%.*]])
-; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1> [[TMP5]], i16* [[ADDR2:%.*]])
-; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1> [[TMP1]], i16* [[ADDR2]])
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP8]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP2]], i32* [[ADDR1:%.*]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1> [[TMP4]], i16* [[ADDR2:%.*]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1> [[TMP1]], i16* [[ADDR2]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP7]]
  ;
    %1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
    %2 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-reinterpret.ll

similarity index 71%

rename from llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll

rename to llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-reinterpret.ll

index 792556b..0b58da6 100644 (file)
--- a/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-reinterpret.ll
@@ -1,9 +1,11 @@
-; RUN: opt -S -aarch64-sve-intrinsic-opts -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck --check-prefix OPT %s
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+target triple = "aarch64"
  
  define <vscale x 8 x i1> @reinterpret_test_h(<vscale x 8 x i1> %a) {
-; OPT-LABEL: @reinterpret_test_h(
-; OPT-NOT: convert
-; OPT: ret <vscale x 8 x i1> %a
+; CHECK-LABEL: @reinterpret_test_h(
+; CHECK-NOT: convert
+; CHECK: ret <vscale x 8 x i1> %a
    %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
    %2 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %1)
    ret <vscale x 8 x i1> %2
@@ -12,19 +14,19 @@ define <vscale x 8 x i1> @reinterpret_test_h(<vscale x 8 x i1> %a) {
  ; Reinterprets are not redundant because the second reinterpret zeros the
  ; lanes that don't exist within its input.
  define <vscale x 16 x i1> @reinterpret_test_h_rev(<vscale x 16 x i1> %a) {
-; OPT-LABEL: @reinterpret_test_h_rev(
-; OPT: %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %a)
-; OPT-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %1)
-; OPT-NEXT: ret <vscale x 16 x i1> %2
+; CHECK-LABEL: @reinterpret_test_h_rev(
+; CHECK: %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %a)
+; CHECK-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %1)
+; CHECK-NEXT: ret <vscale x 16 x i1> %2
    %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %a)
    %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %1)
    ret <vscale x 16 x i1> %2
  }
  
  define <vscale x 4 x i1> @reinterpret_test_w(<vscale x 4 x i1> %a) {
-; OPT-LABEL: @reinterpret_test_w(
-; OPT-NOT: convert
-; OPT: ret <vscale x 4 x i1> %a
+; CHECK-LABEL: @reinterpret_test_w(
+; CHECK-NOT: convert
+; CHECK: ret <vscale x 4 x i1> %a
    %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
    %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
    ret <vscale x 4 x i1> %2
@@ -33,19 +35,19 @@ define <vscale x 4 x i1> @reinterpret_test_w(<vscale x 4 x i1> %a) {
  ; Reinterprets are not redundant because the second reinterpret zeros the
  ; lanes that don't exist within its input.
  define <vscale x 16 x i1> @reinterpret_test_w_rev(<vscale x 16 x i1> %a) {
-; OPT-LABEL: @reinterpret_test_w_rev(
-; OPT: %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %a)
-; OPT-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
-; OPT-NEXT: ret <vscale x 16 x i1> %2
+; CHECK-LABEL: @reinterpret_test_w_rev(
+; CHECK: %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %a)
+; CHECK-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
+; CHECK-NEXT: ret <vscale x 16 x i1> %2
    %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %a)
    %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
    ret <vscale x 16 x i1> %2
  }
  
  define <vscale x 2 x i1> @reinterpret_test_d(<vscale x 2 x i1> %a) {
-; OPT-LABEL: @reinterpret_test_d(
-; OPT-NOT: convert
-; OPT: ret <vscale x 2 x i1> %a
+; CHECK-LABEL: @reinterpret_test_d(
+; CHECK-NOT: convert
+; CHECK: ret <vscale x 2 x i1> %a
    %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
    %2 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %1)
    ret <vscale x 2 x i1> %2
@@ -54,18 +56,18 @@ define <vscale x 2 x i1> @reinterpret_test_d(<vscale x 2 x i1> %a) {
  ; Reinterprets are not redundant because the second reinterpret zeros the
  ; lanes that don't exist within its input.
  define <vscale x 16 x i1> @reinterpret_test_d_rev(<vscale x 16 x i1> %a) {
-; OPT-LABEL: @reinterpret_test_d_rev(
-; OPT: %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %a)
-; OPT-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %1)
-; OPT-NEXT: ret <vscale x 16 x i1> %2
+; CHECK-LABEL: @reinterpret_test_d_rev(
+; CHECK: %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %a)
+; CHECK-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %1)
+; CHECK-NEXT: ret <vscale x 16 x i1> %2
    %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %a)
    %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %1)
    ret <vscale x 16 x i1> %2
  }
  
  define <vscale x 2 x i1> @reinterpret_test_full_chain(<vscale x 2 x i1> %a) {
-; OPT-LABEL: @reinterpret_test_full_chain(
-; OPT: ret <vscale x 2 x i1> %a
+; CHECK-LABEL: @reinterpret_test_full_chain(
+; CHECK: ret <vscale x 2 x i1> %a
    %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
    %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
    %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %2)
@@ -78,10 +80,10 @@ define <vscale x 2 x i1> @reinterpret_test_full_chain(<vscale x 2 x i1> %a) {
  ; The last two reinterprets are not necessary, since they are doing the same
  ; work as the first two.
  define <vscale x 4 x i1> @reinterpret_test_partial_chain(<vscale x 2 x i1> %a) {
-; OPT-LABEL: @reinterpret_test_partial_chain(
-; OPT: %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
-; OPT-NEXT: %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
-; OPT-NEXT: ret <vscale x 4 x i1> %2
+; CHECK-LABEL: @reinterpret_test_partial_chain(
+; CHECK: %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
+; CHECK-NEXT: %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
+; CHECK-NEXT: ret <vscale x 4 x i1> %2
    %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
    %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
    %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %2)
@@ -92,12 +94,12 @@ define <vscale x 4 x i1> @reinterpret_test_partial_chain(<vscale x 2 x i1> %a) {
  ; The chain cannot be reduced because of the second reinterpret, which causes
  ; zeroing.
  define <vscale x 8 x i1> @reinterpret_test_irreducible_chain(<vscale x 8 x i1> %a) {
-; OPT-LABEL: @reinterpret_test_irreducible_chain(
-; OPT: %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
-; OPT-NEXT: %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
-; OPT-NEXT: %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %2)
-; OPT-NEXT: %4 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %3)
-; OPT-NEXT: ret <vscale x 8 x i1> %4
+; CHECK-LABEL: @reinterpret_test_irreducible_chain(
+; CHECK: %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
+; CHECK-NEXT: %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
+; CHECK-NEXT: %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %2)
+; CHECK-NEXT: %4 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %3)
+; CHECK-NEXT: ret <vscale x 8 x i1> %4
    %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
    %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
    %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %2)
@@ -108,10 +110,10 @@ define <vscale x 8 x i1> @reinterpret_test_irreducible_chain(<vscale x 8 x i1> %
  ; Here, the candidate list is larger than the number of instructions that we
  ; end up removing.
  define <vscale x 4 x i1> @reinterpret_test_keep_some_candidates(<vscale x 8 x i1> %a) {
-; OPT-LABEL: @reinterpret_test_keep_some_candidates(
-; OPT: %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
-; OPT-NEXT: %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
-; OPT-NEXT: ret <vscale x 4 x i1> %2
+; CHECK-LABEL: @reinterpret_test_keep_some_candidates(
+; CHECK: %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
+; CHECK-NEXT: %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
+; CHECK-NEXT: ret <vscale x 4 x i1> %2
    %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
    %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
    %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %2)
@@ -120,12 +122,12 @@ define <vscale x 4 x i1> @reinterpret_test_keep_some_candidates(<vscale x 8 x i1
  }
  
  define <vscale x 2 x i1> @reinterpret_reductions(i32 %cond, <vscale x 2 x i1> %a, <vscale x 2 x i1> %b, <vscale x 2 x i1> %c) {
-; OPT-LABEL: reinterpret_reductions
-; OPT-NOT: convert
-; OPT-NOT: phi <vscale x 16 x i1>
-; OPT: phi <vscale x 2 x i1> [ %a, %br_phi_a ], [ %b, %br_phi_b ], [ %c, %br_phi_c ]
-; OPT-NOT: convert
-; OPT: ret
+; CHECK-LABEL: reinterpret_reductions
+; CHECK-NOT: convert
+; CHECK-NOT: phi <vscale x 16 x i1>
+; CHECK: phi <vscale x 2 x i1> [ %a, %br_phi_a ], [ %b, %br_phi_b ], [ %c, %br_phi_c ]
+; CHECK-NOT: convert
+; CHECK: ret
  
  entry:
    switch i32 %cond, label %br_phi_c [
@@ -154,12 +156,12 @@ join:
  ; No transform as the reinterprets are converting from different types (nxv2i1 & nxv4i1)
  ; As the incoming values to the phi must all be the same type, we cannot remove the reinterprets.
  define <vscale x 2 x i1> @reinterpret_reductions_1(i32 %cond, <vscale x 2 x i1> %a, <vscale x 4 x i1> %b, <vscale x 2 x i1> %c) {
-; OPT-LABEL: reinterpret_reductions_1
-; OPT: convert
-; OPT: phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
-; OPT-NOT: phi <vscale x 2 x i1>
-; OPT: tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
-; OPT: ret
+; CHECK-LABEL: reinterpret_reductions_1
+; CHECK: convert
+; CHECK: phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
+; CHECK-NOT: phi <vscale x 2 x i1>
+; CHECK: tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
+; CHECK: ret
  
  entry:
    switch i32 %cond, label %br_phi_c [
@@ -188,12 +190,12 @@ join:
  ; No transform. Similar to the the test above, but here only two of the arguments need to
  ; be converted to svbool.
  define <vscale x 2 x i1> @reinterpret_reductions_2(i32 %cond, <vscale x 2 x i1> %a, <vscale x 16 x i1> %b, <vscale x 2 x i1> %c) {
-; OPT-LABEL: reinterpret_reductions_2
-; OPT: convert
-; OPT: phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b, %br_phi_b ], [ %c1, %br_phi_c ]
-; OPT-NOT: phi <vscale x 2 x i1>
-; OPT: tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
-; OPT: ret
+; CHECK-LABEL: reinterpret_reductions_2
+; CHECK: convert
+; CHECK: phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b, %br_phi_b ], [ %c1, %br_phi_c ]
+; CHECK-NOT: phi <vscale x 2 x i1>
+; CHECK: tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
+; CHECK: ret
  
  entry:
    switch i32 %cond, label %br_phi_c [
@@ -221,11 +223,10 @@ join:
  ; Similar to reinterpret_reductions but the reinterprets remain because the
  ; original phi cannot be removed (i.e. prefer reinterprets over multiple phis).
  define <vscale x 16 x i1> @reinterpret_reductions3(i32 %cond, <vscale x 2 x i1> %a, <vscale x 2 x i1> %b, <vscale x 2 x i1> %c) {
-; OPT-LABEL: reinterpret_reductions3
-; OPT: phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
-; OPT-NOT: phi <vscale x 2 x i1>
-; OPT: tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
-; OPT-NEXT: ret <vscale x 16 x i1> %pg
+; CHECK-LABEL: reinterpret_reductions3
+; CHECK: phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
+; CHECK-NOT: phi <vscale x 2 x i1>
+; CHECK: ret <vscale x 16 x i1> %pg
  
  entry:
    switch i32 %cond, label %br_phi_c [
@@ -257,3 +258,5 @@ declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x
  declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)
  declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)
  declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>)
+
+attributes #0 = { "target-features"="+sve" }
author	Bradley Smith <bradley.smith@arm.com>
	Mon, 26 Apr 2021 15:19:25 +0000 (16:19 +0100)
committer	Bradley Smith <bradley.smith@arm.com>
	Thu, 29 Apr 2021 11:17:42 +0000 (12:17 +0100)
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp		patch \| blob \| history
llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp		patch \| blob \| history
llvm/test/CodeGen/AArch64/sve-coalesce-ptrue-intrinsics.ll		patch \| blob \| history
llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-reinterpret.ll	[moved from llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll with 71% similarity]	patch \| blob \| history