#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <algorithm>
using namespace llvm;
using namespace llvm::PatternMatch;
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
+// Combine calls to the SVE lasta/lastb intrinsics into plain extractelement
+// instructions whenever the extracted lane index is known at compile time.
+//
+// lastb reads the element at the last active lane of the predicate; lasta
+// reads the element one past it (see the IsAfter adjustment below). When the
+// predicate is a ptrue with a fixed-length pattern the lane is a constant.
+//
+// \param IC the combiner used to perform the replacement.
+// \param II the lasta/lastb intrinsic call being visited.
+// \returns the replacement instruction, or None if no combine applies.
+static Optional<Instruction *> instCombineSVELast(InstCombiner &IC,
+ IntrinsicInst &II) {
+ Value *Pg = II.getArgOperand(0);
+ Value *Vec = II.getArgOperand(1);
+ bool IsAfter = II.getIntrinsicID() == Intrinsic::aarch64_sve_lasta;
+
+ // lasta with an all-false predicate has no "last active" lane; it yields
+ // the first vector element, so it reduces to a plain extract of lane 0.
+ auto *C = dyn_cast<Constant>(Pg);
+ if (IsAfter && C && C->isNullValue()) {
+ // The intrinsic is extracting lane 0 so use an extract instead.
+ auto *IdxTy = Type::getInt64Ty(II.getContext());
+ auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
+ Extract->insertBefore(&II);
+ Extract->takeName(&II);
+ return IC.replaceInstUsesWith(II, Extract);
+ }
+
+ // Beyond this point the predicate must be a ptrue intrinsic whose pattern
+ // operand pins down the position of the last active lane.
+ auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
+ if (!IntrPG)
+ return None;
+
+ if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
+ return None;
+
+ // ptrue's pattern operand is an immediate, so the cast cannot fail.
+ const auto PTruePattern =
+ cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
+
+ // Can the intrinsic's predicate be converted to a known constant index?
+ // vl<N> patterns activate exactly the first N lanes, making the last active
+ // lane N-1. Any other pattern (all, pow2, mul3, ...) depends on the runtime
+ // vector length, so nothing can be folded and we bail out.
+ unsigned Idx;
+ switch (PTruePattern) {
+ default:
+ return None;
+ case AArch64SVEPredPattern::vl1:
+ Idx = 0;
+ break;
+ case AArch64SVEPredPattern::vl2:
+ Idx = 1;
+ break;
+ case AArch64SVEPredPattern::vl3:
+ Idx = 2;
+ break;
+ case AArch64SVEPredPattern::vl4:
+ Idx = 3;
+ break;
+ case AArch64SVEPredPattern::vl5:
+ Idx = 4;
+ break;
+ case AArch64SVEPredPattern::vl6:
+ Idx = 5;
+ break;
+ case AArch64SVEPredPattern::vl7:
+ Idx = 6;
+ break;
+ case AArch64SVEPredPattern::vl8:
+ Idx = 7;
+ break;
+ case AArch64SVEPredPattern::vl16:
+ Idx = 15;
+ break;
+ }
+
+ // Increment the index if extracting the element after the last active
+ // predicate element.
+ if (IsAfter)
+ ++Idx;
+
+ // Ignore extracts whose index is larger than the known minimum vector
+ // length. NOTE: This is an artificial constraint where we prefer to
+ // maintain what the user asked for until an alternative is proven faster.
+ auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
+ if (Idx >= PgVTy->getMinNumElements())
+ return None;
+
+ // The intrinsic is extracting a fixed lane so use an extract instead.
+ auto *IdxTy = Type::getInt64Ty(II.getContext());
+ auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
+ Extract->insertBefore(&II);
+ Extract->takeName(&II);
+ return IC.replaceInstUsesWith(II, Extract);
+}
+
+Optional<Instruction *>
+AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
+                                     IntrinsicInst &II) const {
+  // Route AArch64-specific intrinsics to their dedicated combines. Returning
+  // None leaves the call to the target-independent InstCombine handling.
+  switch (II.getIntrinsicID()) {
+  case Intrinsic::aarch64_sve_lasta:
+  case Intrinsic::aarch64_sve_lastb:
+    // Both flavours share a single combine that distinguishes them by ID.
+    return instCombineSVELast(IC, II);
+  default:
+    return None;
+  }
+}
+
bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
ArrayRef<const Value *> Args) {
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -instcombine -S < %s | FileCheck --check-prefix OPT %s
+
+target triple = "aarch64"
+
+; Most of the testing is covered by the lastb cases, but here we ensure that
+; lasta with a predicate having no active lanes is treated as an alias to
+; extracting the first vector element.
+define i8 @lasta_extractelement_0(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lasta_extractelement_0(
+; OPT-NEXT: [[E0:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 0
+; OPT-NEXT: ret i8 [[E0]]
+;
+ %e0 = tail call i8 @llvm.aarch64.sve.lasta.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %v)
+ ret i8 %e0
+}
+
+; Most of the testing is covered by the lastb cases, but here we check the
+; resulting extraction index is one more than the lastb case because lasta
+; extracts the element after the last active.
+define i8 @lasta_extractelement_8(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lasta_extractelement_8(
+; OPT-NEXT: [[E1:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 8
+; OPT-NEXT: ret i8 [[E1]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 8)
+ %e1 = tail call i8 @llvm.aarch64.sve.lasta.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e1
+}
+
+; The lastb tests below cover each foldable ptrue pattern (vl1..vl8 and vl16,
+; i.e. pattern immediates 1-9): lastb with ptrue.vl<N> extracts element N-1,
+; the last active lane.
+define i8 @lastb_extractelement_0(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_0(
+; OPT-NEXT: [[E0:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 0
+; OPT-NEXT: ret i8 [[E0]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 1)
+ %e0 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e0
+}
+
+define i8 @lastb_extractelement_1(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_1(
+; OPT-NEXT: [[E1:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 1
+; OPT-NEXT: ret i8 [[E1]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 2)
+ %e1 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e1
+}
+
+define i8 @lastb_extractelement_2(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_2(
+; OPT-NEXT: [[E2:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 2
+; OPT-NEXT: ret i8 [[E2]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 3)
+ %e2 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e2
+}
+
+define i8 @lastb_extractelement_3(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_3(
+; OPT-NEXT: [[E3:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 3
+; OPT-NEXT: ret i8 [[E3]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 4)
+ %e3 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e3
+}
+
+define i8 @lastb_extractelement_4(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_4(
+; OPT-NEXT: [[E4:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 4
+; OPT-NEXT: ret i8 [[E4]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 5)
+ %e4 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e4
+}
+
+define i8 @lastb_extractelement_5(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_5(
+; OPT-NEXT: [[E5:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 5
+; OPT-NEXT: ret i8 [[E5]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 6)
+ %e5 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e5
+}
+
+define i8 @lastb_extractelement_6(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_6(
+; OPT-NEXT: [[E6:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 6
+; OPT-NEXT: ret i8 [[E6]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 7)
+ %e6 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e6
+}
+
+define i8 @lastb_extractelement_7(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_7(
+; OPT-NEXT: [[E7:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 7
+; OPT-NEXT: ret i8 [[E7]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 8)
+ %e7 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e7
+}
+
+define i8 @lastb_extractelement_15(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_15(
+; OPT-NEXT: [[E15:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 15
+; OPT-NEXT: ret i8 [[E15]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 9)
+ %e15 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e15
+}
+
+; No transformation because the requested element is beyond the range of the
+; known minimum element count so we maintain the user's intentions.
+define i8 @lastb_extractelement_31(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_31(
+; OPT-NEXT: [[PG:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 10)
+; OPT-NEXT: [[E31:%.*]] = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[V:%.*]])
+; OPT-NEXT: ret i8 [[E31]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 10)
+ %e31 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e31
+}
+
+; No transformation because the ptrue's predicate pattern is bogus and thus
+; nothing can be inferred about the result.
+define i8 @lastb_extractelement_invalid_predicate_pattern(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_invalid_predicate_pattern(
+; OPT-NEXT: [[PG:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 15)
+; OPT-NEXT: [[E:%.*]] = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[V:%.*]])
+; OPT-NEXT: ret i8 [[E]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 15)
+ %e = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e
+}
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32)
+declare i8 @llvm.aarch64.sve.lasta.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>)
+declare i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>)
+
+attributes #0 = { "target-features"="+sve" }