From: Alexey Bataev
Date: Thu, 16 Dec 2021 16:55:52 +0000 (-0800)
Subject: [SLP]Alternate vectorization for cmp instructions.
X-Git-Tag: upstream/15.0.7~17934
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=ad2a0ccf8f9816fd7805138bbf582117246f5489;p=platform%2Fupstream%2Fllvm.git

[SLP]Alternate vectorization for cmp instructions.

Added support for alternate-ops vectorization of cmp instructions. It
allows vectorizing either cmp instructions with the same or swapped
predicate but different (swapped) operand kinds, or cmp instructions
with different predicates and compatible operand kinds.

Differential Revision: https://reviews.llvm.org/D115955
---

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 25bf697..ebf5f78 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -471,17 +471,36 @@ static bool isValidForAlternation(unsigned Opcode) {
   return true;
 }
 
+static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
+                                       unsigned BaseIndex = 0);
+
+/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
+/// compatible instructions or constants, or just some other regular values.
+static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
+                                Value *Op1) {
+  return (isConstant(BaseOp0) && isConstant(Op0)) ||
+         (isConstant(BaseOp1) && isConstant(Op1)) ||
+         (!isa<UndefValue>(BaseOp0) && !isa<UndefValue>(Op0) &&
+          !isa<UndefValue>(BaseOp1) && !isa<UndefValue>(Op1)) ||
+         getSameOpcode({BaseOp0, Op0}).getOpcode() ||
+         getSameOpcode({BaseOp1, Op1}).getOpcode();
+}
+
 /// \returns analysis of the Instructions in \p VL described in
 /// InstructionsState, the Opcode that we suppose the whole list
 /// could be vectorized even if its structure is diverse.
 static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
-                                       unsigned BaseIndex = 0) {
+                                       unsigned BaseIndex) {
   // Make sure these are all Instructions.
   if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
     return InstructionsState(VL[BaseIndex], nullptr, nullptr);
   bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
   bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
+  bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
+  CmpInst::Predicate BasePred =
+      IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
+              : CmpInst::BAD_ICMP_PREDICATE;
   unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
   unsigned AltOpcode = Opcode;
   unsigned AltIndex = BaseIndex;
@@ -514,6 +533,57 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
           continue;
         }
       }
+    } else if (IsCmpOp && isa<CmpInst>(VL[Cnt])) {
+      auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
+      auto *Inst = cast<CmpInst>(VL[Cnt]);
+      Type *Ty0 = BaseInst->getOperand(0)->getType();
+      Type *Ty1 = Inst->getOperand(0)->getType();
+      if (Ty0 == Ty1) {
+        Value *BaseOp0 = BaseInst->getOperand(0);
+        Value *BaseOp1 = BaseInst->getOperand(1);
+        Value *Op0 = Inst->getOperand(0);
+        Value *Op1 = Inst->getOperand(1);
+        CmpInst::Predicate CurrentPred =
+            cast<CmpInst>(VL[Cnt])->getPredicate();
+        CmpInst::Predicate SwappedCurrentPred =
+            CmpInst::getSwappedPredicate(CurrentPred);
+        // Check for compatible operands. If the corresponding operands are not
+        // compatible - need to perform alternate vectorization.
+        if (InstOpcode == Opcode) {
+          if (BasePred == CurrentPred &&
+              areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1))
+            continue;
+          if (BasePred == SwappedCurrentPred &&
+              areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0))
+            continue;
+          if (E == 2 &&
+              (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
+            continue;
+          auto *AltInst = cast<CmpInst>(VL[AltIndex]);
+          CmpInst::Predicate AltPred = AltInst->getPredicate();
+          Value *AltOp0 = AltInst->getOperand(0);
+          Value *AltOp1 = AltInst->getOperand(1);
+          // Check if operands are compatible with alternate operands.
+          if (AltPred == CurrentPred &&
+              areCompatibleCmpOps(AltOp0, AltOp1, Op0, Op1))
+            continue;
+          if (AltPred == SwappedCurrentPred &&
+              areCompatibleCmpOps(AltOp0, AltOp1, Op1, Op0))
+            continue;
+        }
+        if (BaseIndex == AltIndex && BasePred != CurrentPred) {
+          assert(isValidForAlternation(Opcode) &&
+                 isValidForAlternation(InstOpcode) &&
+                 "Cast isn't safe for alternation, logic needs to be updated!");
+          AltIndex = Cnt;
+          continue;
+        }
+        auto *AltInst = cast<CmpInst>(VL[AltIndex]);
+        CmpInst::Predicate AltPred = AltInst->getPredicate();
+        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
+            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
+          continue;
+      }
     } else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
       continue;
     return InstructionsState(VL[BaseIndex], nullptr, nullptr);
@@ -4354,9 +4424,41 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
 
       // Reorder operands if reordering would enable vectorization.
-      if (isa<BinaryOperator>(VL0)) {
+      auto *CI = dyn_cast<CmpInst>(VL0);
+      if (isa<BinaryOperator>(VL0) || CI) {
         ValueList Left, Right;
-        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
+        if (!CI || all_of(VL, [](Value *V) {
+              return cast<CmpInst>(V)->isCommutative();
+            })) {
+          reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
+        } else {
+          CmpInst::Predicate P0 = CI->getPredicate();
+          CmpInst::Predicate AltP0 = cast<CmpInst>(S.AltOp)->getPredicate();
+          assert(P0 != AltP0 &&
+                 "Expected different main/alternate predicates.");
+          CmpInst::Predicate AltP0Swapped = CmpInst::getSwappedPredicate(AltP0);
+          Value *BaseOp0 = VL0->getOperand(0);
+          Value *BaseOp1 = VL0->getOperand(1);
+          // Collect operands - commute if it uses the swapped predicate or
+          // alternate operation.
+          for (Value *V : VL) {
+            auto *Cmp = cast<CmpInst>(V);
+            Value *LHS = Cmp->getOperand(0);
+            Value *RHS = Cmp->getOperand(1);
+            CmpInst::Predicate CurrentPred = Cmp->getPredicate();
+            if (P0 == AltP0Swapped) {
+              if ((P0 == CurrentPred &&
+                   !areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)) ||
+                  (AltP0 == CurrentPred &&
+                   areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)))
+                std::swap(LHS, RHS);
+            } else if (P0 != CurrentPred && AltP0 != CurrentPred) {
+              std::swap(LHS, RHS);
+            }
+            Left.push_back(LHS);
+            Right.push_back(RHS);
+          }
+        }
       TE->setOperand(0, Left);
       TE->setOperand(1, Right);
       buildTree_rec(Left, Depth + 1, {TE, 0});
@@ -5288,7 +5390,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
            ((Instruction::isBinaryOp(E->getOpcode()) &&
              Instruction::isBinaryOp(E->getAltOpcode())) ||
             (Instruction::isCast(E->getOpcode()) &&
-             Instruction::isCast(E->getAltOpcode()))) &&
+             Instruction::isCast(E->getAltOpcode())) ||
+            (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
            "Invalid Shuffle Vector Operand");
     InstructionCost ScalarCost = 0;
     if (NeedToShuffleReuses) {
@@ -5336,6 +5439,14 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
       VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
       VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
+    } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
+      VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
+                                        Builder.getInt1Ty(),
+                                        CI0->getPredicate(), CostKind, VL0);
+      VecCost += TTI->getCmpSelInstrCost(
+          E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
+          cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
+          E->getAltOp());
     } else {
       Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
       Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
@@ -5352,6 +5463,27 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
           E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
           [E](Instruction *I) {
             assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+            if (auto *CI0 = dyn_cast<CmpInst>(E->getMainOp())) {
+              auto *AltCI0 = cast<CmpInst>(E->getAltOp());
+              auto *CI = cast<CmpInst>(I);
+              CmpInst::Predicate P0 = CI0->getPredicate();
+              CmpInst::Predicate AltP0 = AltCI0->getPredicate();
+              assert(P0 != AltP0 &&
+                     "Expected different main/alternate predicates.");
+              CmpInst::Predicate AltP0Swapped =
+                  CmpInst::getSwappedPredicate(AltP0);
+              CmpInst::Predicate CurrentPred = CI->getPredicate();
+              if (P0 == AltP0Swapped)
+                return (P0 == CurrentPred &&
+                        !areCompatibleCmpOps(
+                            CI0->getOperand(0), CI0->getOperand(1),
+                            CI->getOperand(0), CI->getOperand(1))) ||
+                       (AltP0 == CurrentPred &&
+                        !areCompatibleCmpOps(
+                            CI0->getOperand(0), CI0->getOperand(1),
+                            CI->getOperand(1), CI->getOperand(0)));
+              return AltP0 == CurrentPred || AltP0Swapped == CurrentPred;
+            }
             return I->getOpcode() == E->getAltOpcode();
           },
           Mask);
@@ -6834,11 +6966,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
            ((Instruction::isBinaryOp(E->getOpcode()) &&
              Instruction::isBinaryOp(E->getAltOpcode())) ||
             (Instruction::isCast(E->getOpcode()) &&
-             Instruction::isCast(E->getAltOpcode()))) &&
+             Instruction::isCast(E->getAltOpcode())) ||
+            (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
            "Invalid Shuffle Vector Operand");
     Value *LHS = nullptr, *RHS = nullptr;
-    if (Instruction::isBinaryOp(E->getOpcode())) {
+    if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
       setInsertPointAfterBundle(E);
       LHS = vectorizeTree(E->getOperand(0));
       RHS = vectorizeTree(E->getOperand(1));
@@ -6858,6 +6991,15 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
           static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
       V1 = Builder.CreateBinOp(
           static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
+    } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
+      V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
+      auto *AltCI = cast<CmpInst>(E->getAltOp());
+      CmpInst::Predicate AltPred = AltCI->getPredicate();
+      unsigned AltIdx =
+          std::distance(E->Scalars.begin(), find(E->Scalars, AltCI));
+      if (AltCI->getOperand(0) != E->getOperand(0)[AltIdx])
+        AltPred = CmpInst::getSwappedPredicate(AltPred);
+      V1 = Builder.CreateCmp(AltPred, LHS, RHS);
     } else {
       V0 = Builder.CreateCast(
           static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
@@ -6882,6 +7024,27 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
           E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
           [E](Instruction *I) {
             assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+            if (auto *CI0 = dyn_cast<CmpInst>(E->getMainOp())) {
+              auto *AltCI0 = cast<CmpInst>(E->getAltOp());
+              auto *CI = cast<CmpInst>(I);
+              CmpInst::Predicate P0 = CI0->getPredicate();
+              CmpInst::Predicate AltP0 = AltCI0->getPredicate();
+              assert(P0 != AltP0 &&
+                     "Expected different main/alternate predicates.");
+              CmpInst::Predicate AltP0Swapped =
+                  CmpInst::getSwappedPredicate(AltP0);
+              CmpInst::Predicate CurrentPred = CI->getPredicate();
+              if (P0 == AltP0Swapped)
+                return (P0 == CurrentPred &&
+                        !areCompatibleCmpOps(
+                            CI0->getOperand(0), CI0->getOperand(1),
+                            CI->getOperand(0), CI->getOperand(1))) ||
+                       (AltP0 == CurrentPred &&
+                        !areCompatibleCmpOps(
+                            CI0->getOperand(0), CI0->getOperand(1),
+                            CI->getOperand(1), CI->getOperand(0)));
+              return AltP0 == CurrentPred || AltP0Swapped == CurrentPred;
+            }
             return I->getOpcode() == E->getAltOpcode();
           },
           Mask, &OpScalars, &AltScalars);
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll
index 8f8931d..0ba0fb8 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx | FileCheck %s
 ;
 ; Check that we can commute operands based on the predicate.
@@ -235,46 +235,14 @@ define <4 x i32> @fcmp_ogt_olt_v4i32(<4 x float> %a, float* %b) {
 }
 
 define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, float* %b) {
-; SSE-LABEL: @fcmp_ord_uno_v4i32(
-; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
-; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
-; SSE-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4
-; SSE-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
-; SSE-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
-; SSE-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4
-; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32>
-; SSE-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
-; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32>
-; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B3]], i64 0
-; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B0]], i64 1
-; SSE-NEXT: [[TMP8:%.*]] = fcmp ord <2 x float> [[TMP5]], [[TMP7]]
-; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <4 x i32>
-; SSE-NEXT: [[D0:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <4 x i32>
-; SSE-NEXT: [[TMP10:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32>
-; SSE-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP10]], <4 x i32>
-; SSE-NEXT: [[D3:%.*]] = shufflevector <4 x i1> [[D21]], <4 x i1> [[TMP9]], <4 x i32>
-; SSE-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
-; SSE-NEXT: ret <4 x i32> [[R]]
-;
-; AVX-LABEL: @fcmp_ord_uno_v4i32(
-; AVX-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
-; AVX-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i64 3
-; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
-; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
-; AVX-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4
-; AVX-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
-; AVX-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
-; AVX-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4
-; AVX-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]]
-; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32>
-; AVX-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
-; AVX-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]]
-; AVX-NEXT: [[D0:%.*]] = insertelement <4 x i1> poison, i1 [[C0]], i64 0
-; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32>
-; AVX-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP5]], <4 x i32>
-; AVX-NEXT: [[D3:%.*]] = insertelement <4 x i1> [[D21]], i1 [[C3]], i64 3
-; AVX-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
-; AVX-NEXT: ret <4 x i32> [[R]]
+; CHECK-LABEL: @fcmp_ord_uno_v4i32(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fcmp ord <4 x float> [[TMP2]], [[A:%.*]]
+; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[TMP2]], [[A]]
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i32>
+; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> [[R]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll
index 3180078..8cdd2c4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx | FileCheck %s
 ;
 ; Check that we can commute operands based on the predicate.
@@ -235,46 +235,14 @@ define <4 x i32> @fcmp_ogt_olt_v4i32(<4 x float> %a, float* %b) {
 }
 
 define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, float* %b) {
-; SSE-LABEL: @fcmp_ord_uno_v4i32(
-; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
-; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
-; SSE-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4
-; SSE-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
-; SSE-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
-; SSE-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4
-; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32>
-; SSE-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
-; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32>
-; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B3]], i64 0
-; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B0]], i64 1
-; SSE-NEXT: [[TMP8:%.*]] = fcmp ord <2 x float> [[TMP5]], [[TMP7]]
-; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <4 x i32>
-; SSE-NEXT: [[D0:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <4 x i32>
-; SSE-NEXT: [[TMP10:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32>
-; SSE-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP10]], <4 x i32>
-; SSE-NEXT: [[D3:%.*]] = shufflevector <4 x i1> [[D21]], <4 x i1> [[TMP9]], <4 x i32>
-; SSE-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
-; SSE-NEXT: ret <4 x i32> [[R]]
-;
-; AVX-LABEL: @fcmp_ord_uno_v4i32(
-; AVX-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
-; AVX-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i64 3
-; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
-; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
-; AVX-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4
-; AVX-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
-; AVX-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
-; AVX-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4
-; AVX-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]]
-; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32>
-; AVX-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
-; AVX-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]]
-; AVX-NEXT: [[D0:%.*]] = insertelement <4 x i1> undef, i1 [[C0]], i64 0
-; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32>
-; AVX-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP5]], <4 x i32>
-; AVX-NEXT: [[D3:%.*]] = insertelement <4 x i1> [[D21]], i1 [[C3]], i64 3
-; AVX-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
-; AVX-NEXT: ret <4 x i32> [[R]]
+; CHECK-LABEL: @fcmp_ord_uno_v4i32(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fcmp ord <4 x float> [[TMP2]], [[A:%.*]]
+; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[TMP2]], [[A]]
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i32>
+; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> [[R]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
index ee3481e2..0d6ebc2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
@@ -438,27 +438,44 @@ define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) {
 }
 
 define i1 @logical_and_icmp_clamp_pred_diff(<4 x i32> %x) {
-; CHECK-LABEL: @logical_and_icmp_clamp_pred_diff(
-; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
-; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
-; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
-; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
-; CHECK-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42
-; CHECK-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42
-; CHECK-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42
-; CHECK-NEXT: [[C3:%.*]] = icmp ult i32 [[X3]], 42
-; CHECK-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17
-; CHECK-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17
-; CHECK-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17
-; CHECK-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17
-; CHECK-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
-; CHECK-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
-; CHECK-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false
-; CHECK-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false
-; CHECK-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false
-; CHECK-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
-; CHECK-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
-; CHECK-NEXT: ret i1 [[S7]]
+; SSE-LABEL: @logical_and_icmp_clamp_pred_diff(
+; SSE-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]],
+; SSE-NEXT: [[TMP2:%.*]] = icmp ult <4 x i32> [[X]],
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP2]], <4 x i32>
+; SSE-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]],
+; SSE-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP3]]
+; SSE-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]])
+; SSE-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
+; SSE-NEXT: [[S4:%.*]] = select i1 [[TMP6]], i1 [[TMP7]], i1 false
+; SSE-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1
+; SSE-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[TMP8]], i1 false
+; SSE-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2
+; SSE-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[TMP9]], i1 false
+; SSE-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3
+; SSE-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[TMP10]], i1 false
+; SSE-NEXT: ret i1 [[S7]]
+;
+; AVX-LABEL: @logical_and_icmp_clamp_pred_diff(
+; AVX-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
+; AVX-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
+; AVX-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
+; AVX-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
+; AVX-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42
+; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42
+; AVX-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42
+; AVX-NEXT: [[C3:%.*]] = icmp ult i32 [[X3]], 42
+; AVX-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17
+; AVX-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17
+; AVX-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17
+; AVX-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17
+; AVX-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
+; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
+; AVX-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false
+; AVX-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false
+; AVX-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false
+; AVX-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
+; AVX-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
+; AVX-NEXT: ret i1 [[S7]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
   %x1 = extractelement <4 x i32> %x, i32 1
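
Illustrative sketch (not part of the patch): the kind of scalar pattern the alternate cmp vectorization above targets, written as minimal standalone IR. The function name is made up, and whether SLP actually vectorizes a given case still depends on the target's cost model.

; Lane 0 compares with ord and lane 1 with uno. With this change SLP can treat
; ord as the main opcode and uno as the alternate one, emit both comparisons as
; vector fcmps over the same operands, and blend the two results with a
; shufflevector, as the updated CHECK lines above show for the 4-wide case.
define <2 x i1> @fcmp_ord_uno_sketch(<2 x float> %a, <2 x float> %b) {
  %a0 = extractelement <2 x float> %a, i32 0
  %a1 = extractelement <2 x float> %a, i32 1
  %b0 = extractelement <2 x float> %b, i32 0
  %b1 = extractelement <2 x float> %b, i32 1
  %c0 = fcmp ord float %a0, %b0
  %c1 = fcmp uno float %a1, %b1
  %r0 = insertelement <2 x i1> poison, i1 %c0, i32 0
  %r1 = insertelement <2 x i1> %r0, i1 %c1, i32 1
  ret <2 x i1> %r1
}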