From 5281f0dab2398fdbc60fc7131f8ad2438600e7ae Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Tue, 1 Feb 2022 11:37:15 +0100 Subject: [PATCH] Revert "[SLP]Alternate vectorization for cmp instructions." This reverts commit afaaecc88c6e5989de8a6a0266610860ef99d9d6. Crashes when compiling SciPy, test case https://reviews.llvm.org/P8276 --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 167 +------------ .../PhaseOrdering/X86/vector-reductions-logical.ll | 57 +++-- .../SLPVectorizer/X86/cmp_commute-inseltpoison.ll | 52 +++- .../Transforms/SLPVectorizer/X86/cmp_commute.ll | 52 +++- .../SLPVectorizer/X86/horizontal-minmax.ll | 24 +- .../SLPVectorizer/X86/reduction-logical.ll | 273 ++++++++------------- 6 files changed, 232 insertions(+), 393 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index ae7c294..25bf697 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -471,36 +471,17 @@ static bool isValidForAlternation(unsigned Opcode) { return true; } -static InstructionsState getSameOpcode(ArrayRef VL, - unsigned BaseIndex = 0); - -/// Checks if the provided operands of 2 cmp instructions are compatible, i.e. -/// compatible instructions or constants, or just some other regular values. -static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, - Value *Op1) { - return (isConstant(BaseOp0) && isConstant(Op0)) || - (isConstant(BaseOp1) && isConstant(Op1)) || - (!isa(BaseOp0) && !isa(Op0) && - !isa(BaseOp1) && !isa(Op1)) || - getSameOpcode({BaseOp0, Op0}).getOpcode() || - getSameOpcode({BaseOp1, Op1}).getOpcode(); -} - /// \returns analysis of the Instructions in \p VL described in /// InstructionsState, the Opcode that we suppose the whole list /// could be vectorized even if its structure is diverse. static InstructionsState getSameOpcode(ArrayRef VL, - unsigned BaseIndex) { + unsigned BaseIndex = 0) { // Make sure these are all Instructions. if (llvm::any_of(VL, [](Value *V) { return !isa(V); })) return InstructionsState(VL[BaseIndex], nullptr, nullptr); bool IsCastOp = isa(VL[BaseIndex]); bool IsBinOp = isa(VL[BaseIndex]); - bool IsCmpOp = isa(VL[BaseIndex]); - CmpInst::Predicate BasePred = - IsCmpOp ? cast(VL[BaseIndex])->getPredicate() - : CmpInst::BAD_ICMP_PREDICATE; unsigned Opcode = cast(VL[BaseIndex])->getOpcode(); unsigned AltOpcode = Opcode; unsigned AltIndex = BaseIndex; @@ -533,47 +514,6 @@ static InstructionsState getSameOpcode(ArrayRef VL, continue; } } - } else if (IsCmpOp && isa(VL[Cnt])) { - auto *BaseInst = cast(VL[BaseIndex]); - auto *Inst = cast(VL[Cnt]); - Type *Ty0 = BaseInst->getOperand(0)->getType(); - Type *Ty1 = Inst->getOperand(0)->getType(); - if (Ty0 == Ty1) { - Value *BaseOp0 = BaseInst->getOperand(0); - Value *BaseOp1 = BaseInst->getOperand(1); - Value *Op0 = Inst->getOperand(0); - Value *Op1 = Inst->getOperand(1); - CmpInst::Predicate CurrentPred = - cast(VL[Cnt])->getPredicate(); - // Check for compatible operands. If the corresponding operands are not - // compatible - need to perform alternate vectorization. - if (InstOpcode == Opcode) { - if (BasePred == CurrentPred && - areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1)) - continue; - if (BasePred == CmpInst::getSwappedPredicate(CurrentPred) && - areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0)) - continue; - auto *AltInst = cast(VL[AltIndex]); - CmpInst::Predicate AltPred = AltInst->getPredicate(); - Value *AltOp0 = AltInst->getOperand(0); - Value *AltOp1 = AltInst->getOperand(1); - // Check if operands are compatible with alternate operands. - if (AltPred == CurrentPred && - areCompatibleCmpOps(AltOp0, AltOp1, Op0, Op1)) - continue; - if (AltPred == CmpInst::getSwappedPredicate(CurrentPred) && - areCompatibleCmpOps(AltOp0, AltOp1, Op1, Op0)) - continue; - } - if (BaseIndex == AltIndex) { - assert(isValidForAlternation(Opcode) && - isValidForAlternation(InstOpcode) && - "Cast isn't safe for alternation, logic needs to be updated!"); - AltIndex = Cnt; - continue; - } - } } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) continue; return InstructionsState(VL[BaseIndex], nullptr, nullptr); @@ -4414,41 +4354,9 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. - auto *CI = dyn_cast(VL0); - if (isa(VL0) || CI) { + if (isa(VL0)) { ValueList Left, Right; - if (!CI || all_of(VL, [](Value *V) { - return cast(V)->isCommutative(); - })) { - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); - } else { - CmpInst::Predicate P0 = CI->getPredicate(); - CmpInst::Predicate AltP0 = cast(S.AltOp)->getPredicate(); - CmpInst::Predicate AltP0Swapped = CmpInst::getSwappedPredicate(AltP0); - Value *BaseOp0 = VL0->getOperand(0); - Value *BaseOp1 = VL0->getOperand(1); - // Collect operands - commute if it uses the swapped predicate or - // alternate operation. - for (Value *V : VL) { - auto *Cmp = cast(V); - Value *LHS = Cmp->getOperand(0); - Value *RHS = Cmp->getOperand(1); - CmpInst::Predicate CurrentPred = CI->getPredicate(); - CmpInst::Predicate CurrentPredSwapped = - CmpInst::getSwappedPredicate(CurrentPred); - if (P0 == AltP0 || P0 == AltP0Swapped) { - if ((P0 == CurrentPred && - !areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)) || - (P0 == CurrentPredSwapped && - !areCompatibleCmpOps(BaseOp0, BaseOp1, RHS, LHS))) - std::swap(LHS, RHS); - } else if (!areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)) { - std::swap(LHS, RHS); - } - Left.push_back(LHS); - Right.push_back(RHS); - } - } + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); TE->setOperand(0, Left); TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); @@ -5380,8 +5288,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, ((Instruction::isBinaryOp(E->getOpcode()) && Instruction::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E->getOpcode()) && - Instruction::isCast(E->getAltOpcode())) || - (isa(VL0) && isa(E->getAltOp()))) && + Instruction::isCast(E->getAltOpcode()))) && "Invalid Shuffle Vector Operand"); InstructionCost ScalarCost = 0; if (NeedToShuffleReuses) { @@ -5429,14 +5336,6 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind); VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind); - } else if (auto *CI0 = dyn_cast(VL0)) { - VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, - Builder.getInt1Ty(), - CI0->getPredicate(), CostKind, VL0); - VecCost += TTI->getCmpSelInstrCost( - E->getOpcode(), ScalarTy, Builder.getInt1Ty(), - cast(E->getAltOp())->getPredicate(), CostKind, - E->getAltOp()); } else { Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType(); Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType(); @@ -5453,28 +5352,6 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, [E](Instruction *I) { assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); - if (auto *CI0 = dyn_cast(E->getMainOp())) { - auto *AltCI0 = cast(E->getAltOp()); - auto *CI = cast(I); - CmpInst::Predicate P0 = CI0->getPredicate(); - CmpInst::Predicate AltP0 = AltCI0->getPredicate(); - CmpInst::Predicate AltP0Swapped = - CmpInst::getSwappedPredicate(AltP0); - CmpInst::Predicate CurrentPred = CI->getPredicate(); - CmpInst::Predicate CurrentPredSwapped = - CmpInst::getSwappedPredicate(CurrentPred); - if (P0 == AltP0 || P0 == AltP0Swapped) { - unsigned Idx = - std::distance(E->Scalars.begin(), find(E->Scalars, I)); - // Alternate cmps have same/swapped predicate as main cmps but - // different order of compatible operands. - ArrayRef VLOp0 = E->getOperand(0); - return (P0 == CurrentPred && CI->getOperand(0) != VLOp0[Idx]) || - (P0 == CurrentPredSwapped && - CI->getOperand(1) != VLOp0[Idx]); - } - return CurrentPred != P0 && CurrentPredSwapped != P0; - } return I->getOpcode() == E->getAltOpcode(); }, Mask); @@ -6957,12 +6834,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ((Instruction::isBinaryOp(E->getOpcode()) && Instruction::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E->getOpcode()) && - Instruction::isCast(E->getAltOpcode())) || - (isa(VL0) && isa(E->getAltOp()))) && + Instruction::isCast(E->getAltOpcode()))) && "Invalid Shuffle Vector Operand"); Value *LHS = nullptr, *RHS = nullptr; - if (Instruction::isBinaryOp(E->getOpcode()) || isa(VL0)) { + if (Instruction::isBinaryOp(E->getOpcode())) { setInsertPointAfterBundle(E); LHS = vectorizeTree(E->getOperand(0)); RHS = vectorizeTree(E->getOperand(1)); @@ -6982,15 +6858,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { static_cast(E->getOpcode()), LHS, RHS); V1 = Builder.CreateBinOp( static_cast(E->getAltOpcode()), LHS, RHS); - } else if (auto *CI0 = dyn_cast(VL0)) { - V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS); - auto *AltCI = cast(E->getAltOp()); - CmpInst::Predicate AltPred = AltCI->getPredicate(); - unsigned AltIdx = - std::distance(E->Scalars.begin(), find(E->Scalars, AltCI)); - if (AltCI->getOperand(0) != E->getOperand(0)[AltIdx]) - AltPred = CmpInst::getSwappedPredicate(AltPred); - V1 = Builder.CreateCmp(AltPred, LHS, RHS); } else { V0 = Builder.CreateCast( static_cast(E->getOpcode()), LHS, VecTy); @@ -7015,28 +6882,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, [E](Instruction *I) { assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); - if (auto *CI0 = dyn_cast(E->getMainOp())) { - auto *AltCI0 = cast(E->getAltOp()); - auto *CI = cast(I); - CmpInst::Predicate P0 = CI0->getPredicate(); - CmpInst::Predicate AltP0 = AltCI0->getPredicate(); - CmpInst::Predicate AltP0Swapped = - CmpInst::getSwappedPredicate(AltP0); - CmpInst::Predicate CurrentPred = CI->getPredicate(); - CmpInst::Predicate CurrentPredSwapped = - CmpInst::getSwappedPredicate(CurrentPred); - if (P0 == AltP0 || P0 == AltP0Swapped) { - unsigned Idx = - std::distance(E->Scalars.begin(), find(E->Scalars, I)); - // Alternate cmps have same/swapped predicate as main cmps but - // different order of compatible operands. - ArrayRef VLOp0 = E->getOperand(0); - return (P0 == CurrentPred && CI->getOperand(0) != VLOp0[Idx]) || - (P0 == CurrentPredSwapped && - CI->getOperand(1) != VLOp0[Idx]); - } - return CurrentPred != P0 && CurrentPredSwapped != P0; - } return I->getOpcode() == E->getAltOpcode(); }, Mask, &OpScalars, &AltScalars); diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll index 83cab29..486faf7 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll @@ -90,17 +90,24 @@ return: define float @test_merge_anyof_v4sf(<4 x float> %t) { ; CHECK-LABEL: @test_merge_anyof_v4sf( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[T:%.*]], <4 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <8 x float> [[SHUFFLE]], -; CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <8 x float> [[SHUFFLE]], -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = freeze <8 x i1> [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP4]], 0 -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[T]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[SHIFT]], [[T]] -; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x float> [[TMP5]], i64 0 -; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[DOTNOT]], float [[ADD]], float 0.000000e+00 +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[T:%.*]], i64 3 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[T]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[T]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[T]], i64 0 +; CHECK-NEXT: [[T_FR:%.*]] = freeze <4 x float> [[T]] +; CHECK-NEXT: [[TMP4:%.*]] = fcmp olt <4 x float> [[T_FR]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i1> [[TMP4]] to i4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i4 [[TMP5]], 0 +; CHECK-NEXT: [[CMP19:%.*]] = fcmp ogt float [[TMP3]], 1.000000e+00 +; CHECK-NEXT: [[OR_COND3:%.*]] = select i1 [[TMP6]], i1 true, i1 [[CMP19]] +; CHECK-NEXT: [[CMP24:%.*]] = fcmp ogt float [[TMP2]], 1.000000e+00 +; CHECK-NEXT: [[OR_COND4:%.*]] = select i1 [[OR_COND3]], i1 true, i1 [[CMP24]] +; CHECK-NEXT: [[CMP29:%.*]] = fcmp ogt float [[TMP1]], 1.000000e+00 +; CHECK-NEXT: [[OR_COND5:%.*]] = select i1 [[OR_COND4]], i1 true, i1 [[CMP29]] +; CHECK-NEXT: [[CMP34:%.*]] = fcmp ogt float [[TMP0]], 1.000000e+00 +; CHECK-NEXT: [[OR_COND6:%.*]] = select i1 [[OR_COND5]], i1 true, i1 [[CMP34]] +; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[OR_COND6]], float 0.000000e+00, float [[ADD]] ; CHECK-NEXT: ret float [[RETVAL_0]] ; entry: @@ -413,18 +420,24 @@ return: define float @test_merge_anyof_v4si(<4 x i32> %t) { ; CHECK-LABEL: @test_merge_anyof_v4si( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[T:%.*]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP0:%.*]] = icmp slt <8 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <8 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = freeze <8 x i1> [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP4]], 0 -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[T]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[SHIFT]], [[T]] -; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x i32> [[TMP5]], i64 0 +; CHECK-NEXT: [[T_FR:%.*]] = freeze <4 x i32> [[T:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp slt <4 x i32> [[T_FR]], +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i1> [[TMP0]] to i4 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i4 [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[T_FR]], +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i64 0 +; CHECK-NEXT: [[OR_COND3:%.*]] = or i1 [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP3]], i64 1 +; CHECK-NEXT: [[OR_COND4:%.*]] = or i1 [[OR_COND3]], [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP3]], i64 2 +; CHECK-NEXT: [[OR_COND5:%.*]] = or i1 [[OR_COND4]], [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3 +; CHECK-NEXT: [[OR_COND6:%.*]] = or i1 [[OR_COND5]], [[TMP7]] +; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[T_FR]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[SHIFT]], [[T_FR]] +; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x i32> [[TMP8]], i64 0 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[ADD]] to float -; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[DOTNOT]], float [[CONV]], float 0.000000e+00 +; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[OR_COND6]], float 0.000000e+00, float [[CONV]] ; CHECK-NEXT: ret float [[RETVAL_0]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll index 0ba0fb8..8f8931d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx | FileCheck %s +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX ; ; Check that we can commute operands based on the predicate. @@ -235,14 +235,46 @@ define <4 x i32> @fcmp_ogt_olt_v4i32(<4 x float> %a, float* %b) { } define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, float* %b) { -; CHECK-LABEL: @fcmp_ord_uno_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[B:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fcmp ord <4 x float> [[TMP2]], [[A:%.*]] -; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[TMP2]], [[A]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[R]] +; SSE-LABEL: @fcmp_ord_uno_v4i32( +; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1 +; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3 +; SSE-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4 +; SSE-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>* +; SSE-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 +; SSE-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4 +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]] +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> +; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B3]], i64 0 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B0]], i64 1 +; SSE-NEXT: [[TMP8:%.*]] = fcmp ord <2 x float> [[TMP5]], [[TMP7]] +; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <4 x i32> +; SSE-NEXT: [[D0:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <4 x i32> +; SSE-NEXT: [[TMP10:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> +; SSE-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP10]], <4 x i32> +; SSE-NEXT: [[D3:%.*]] = shufflevector <4 x i1> [[D21]], <4 x i1> [[TMP9]], <4 x i32> +; SSE-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32> +; SSE-NEXT: ret <4 x i32> [[R]] +; +; AVX-LABEL: @fcmp_ord_uno_v4i32( +; AVX-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 +; AVX-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i64 3 +; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1 +; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3 +; AVX-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4 +; AVX-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>* +; AVX-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 +; AVX-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4 +; AVX-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]] +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> +; AVX-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]] +; AVX-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]] +; AVX-NEXT: [[D0:%.*]] = insertelement <4 x i1> poison, i1 [[C0]], i64 0 +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> +; AVX-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP5]], <4 x i32> +; AVX-NEXT: [[D3:%.*]] = insertelement <4 x i1> [[D21]], i1 [[C3]], i64 3 +; AVX-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32> +; AVX-NEXT: ret <4 x i32> [[R]] ; %a0 = extractelement <4 x float> %a, i32 0 %a1 = extractelement <4 x float> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll index 8cdd2c4..3180078 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx | FileCheck %s +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX ; ; Check that we can commute operands based on the predicate. @@ -235,14 +235,46 @@ define <4 x i32> @fcmp_ogt_olt_v4i32(<4 x float> %a, float* %b) { } define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, float* %b) { -; CHECK-LABEL: @fcmp_ord_uno_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[B:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fcmp ord <4 x float> [[TMP2]], [[A:%.*]] -; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[TMP2]], [[A]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[R]] +; SSE-LABEL: @fcmp_ord_uno_v4i32( +; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1 +; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3 +; SSE-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4 +; SSE-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>* +; SSE-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 +; SSE-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4 +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]] +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> +; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B3]], i64 0 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B0]], i64 1 +; SSE-NEXT: [[TMP8:%.*]] = fcmp ord <2 x float> [[TMP5]], [[TMP7]] +; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <4 x i32> +; SSE-NEXT: [[D0:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <4 x i32> +; SSE-NEXT: [[TMP10:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> +; SSE-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP10]], <4 x i32> +; SSE-NEXT: [[D3:%.*]] = shufflevector <4 x i1> [[D21]], <4 x i1> [[TMP9]], <4 x i32> +; SSE-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32> +; SSE-NEXT: ret <4 x i32> [[R]] +; +; AVX-LABEL: @fcmp_ord_uno_v4i32( +; AVX-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 +; AVX-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i64 3 +; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1 +; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3 +; AVX-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4 +; AVX-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>* +; AVX-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 +; AVX-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4 +; AVX-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]] +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> +; AVX-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]] +; AVX-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]] +; AVX-NEXT: [[D0:%.*]] = insertelement <4 x i1> undef, i1 [[C0]], i64 0 +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> +; AVX-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP5]], <4 x i32> +; AVX-NEXT: [[D3:%.*]] = insertelement <4 x i1> [[D21]], i1 [[C3]], i64 3 +; AVX-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32> +; AVX-NEXT: ret <4 x i32> [[R]] ; %a0 = extractelement <4 x float> %a, i32 0 %a1 = extractelement <4 x float> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll index a00c951..eb317de 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll @@ -913,22 +913,18 @@ define i32 @maxi8_mutiple_uses(i32) { ; THRESH-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], [[TMP6]] ; THRESH-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP6]] ; THRESH-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0 -; THRESH-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP4]], i32 1 +; THRESH-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP3]], i32 1 ; THRESH-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0 -; THRESH-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP3]], i32 1 +; THRESH-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP4]], i32 1 ; THRESH-NEXT: [[TMP15:%.*]] = icmp sgt <2 x i32> [[TMP12]], [[TMP14]] -; THRESH-NEXT: [[TMP16:%.*]] = icmp slt <2 x i32> [[TMP12]], [[TMP14]] -; THRESH-NEXT: [[TMP17:%.*]] = shufflevector <2 x i1> [[TMP15]], <2 x i1> [[TMP16]], <2 x i32> -; THRESH-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP3]], i32 1 -; THRESH-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP4]], i32 1 -; THRESH-NEXT: [[TMP20:%.*]] = select <2 x i1> [[TMP17]], <2 x i32> [[TMP18]], <2 x i32> [[TMP19]] -; THRESH-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP20]], i32 0 -; THRESH-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[TMP20]], i32 1 -; THRESH-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP21]], [[TMP22]] -; THRESH-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP21]], i32 [[TMP22]] -; THRESH-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP17]], i32 1 -; THRESH-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 3, i32 4 -; THRESH-NEXT: store i32 [[TMP24]], i32* @var, align 8 +; THRESH-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> [[TMP12]], <2 x i32> [[TMP14]] +; THRESH-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0 +; THRESH-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1 +; THRESH-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] +; THRESH-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP17]], i32 [[TMP18]] +; THRESH-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP15]], i32 1 +; THRESH-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 3, i32 4 +; THRESH-NEXT: store i32 [[TMP20]], i32* @var, align 8 ; THRESH-NEXT: ret i32 [[OP_EXTRA1]] ; %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll index 8ee7963..ee3481e2 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll @@ -214,13 +214,19 @@ define i1 @logical_and_icmp_subvec(<4 x i32> %x) { define i1 @logical_and_icmp_clamp(<4 x i32> %x) { ; CHECK-LABEL: @logical_and_icmp_clamp( -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <8 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP4]]) -; CHECK-NEXT: ret i1 [[TMP5]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], +; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0 +; CHECK-NEXT: [[S4:%.*]] = select i1 [[TMP4]], i1 [[TMP5]], i1 false +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1 +; CHECK-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[TMP6]], i1 false +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2 +; CHECK-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[TMP7]], i1 false +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3 +; CHECK-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[TMP8]], i1 false +; CHECK-NEXT: ret i1 [[S7]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 @@ -245,53 +251,28 @@ define i1 @logical_and_icmp_clamp(<4 x i32> %x) { } define i1 @logical_and_icmp_clamp_extra_use_cmp(<4 x i32> %x) { -; SSE-LABEL: @logical_and_icmp_clamp_extra_use_cmp( -; SSE-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 -; SSE-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 -; SSE-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 -; SSE-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 -; SSE-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42 -; SSE-NEXT: call void @use1(i1 [[C2]]) -; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0 -; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X1]], i32 1 -; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[X3]], i32 2 -; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X0]], i32 3 -; SSE-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], -; SSE-NEXT: [[TMP6:%.*]] = icmp sgt <4 x i32> [[TMP4]], -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i1> [[TMP5]], <4 x i1> [[TMP6]], <4 x i32> -; SSE-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 -; SSE-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17 -; SSE-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 -; SSE-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]] -; SSE-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP8]]) -; SSE-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP9]], i1 [[C2]], i1 false -; SSE-NEXT: [[S5:%.*]] = select i1 [[OP_EXTRA]], i1 [[D1]], i1 false -; SSE-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false -; SSE-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false -; SSE-NEXT: ret i1 [[S7]] -; -; AVX-LABEL: @logical_and_icmp_clamp_extra_use_cmp( -; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> -; AVX-NEXT: [[TMP1:%.*]] = icmp slt <8 x i32> [[SHUFFLE]], -; AVX-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i32> [[SHUFFLE]], -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 1 -; AVX-NEXT: call void @use1(i1 [[TMP4]]) -; AVX-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP3]], i32 2 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP3]], i32 3 -; AVX-NEXT: [[S1:%.*]] = select i1 [[TMP6]], i1 [[TMP5]], i1 false -; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[TMP4]], i1 false -; AVX-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0 -; AVX-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[TMP7]], i1 false -; AVX-NEXT: [[TMP8:%.*]] = extractelement <8 x i1> [[TMP3]], i32 7 -; AVX-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[TMP8]], i1 false -; AVX-NEXT: [[TMP9:%.*]] = extractelement <8 x i1> [[TMP3]], i32 6 -; AVX-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[TMP9]], i1 false -; AVX-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP3]], i32 5 -; AVX-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[TMP10]], i1 false -; AVX-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP3]], i32 4 -; AVX-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[TMP11]], i1 false -; AVX-NEXT: ret i1 [[S7]] +; CHECK-LABEL: @logical_and_icmp_clamp_extra_use_cmp( +; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 +; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 +; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 +; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 +; CHECK-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42 +; CHECK-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42 +; CHECK-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42 +; CHECK-NEXT: call void @use1(i1 [[C2]]) +; CHECK-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 42 +; CHECK-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17 +; CHECK-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 +; CHECK-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17 +; CHECK-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 +; CHECK-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false +; CHECK-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false +; CHECK-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false +; CHECK-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false +; CHECK-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false +; CHECK-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false +; CHECK-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false +; CHECK-NEXT: ret i1 [[S7]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 @@ -317,49 +298,28 @@ define i1 @logical_and_icmp_clamp_extra_use_cmp(<4 x i32> %x) { } define i1 @logical_and_icmp_clamp_extra_use_select(<4 x i32> %x) { -; SSE-LABEL: @logical_and_icmp_clamp_extra_use_select( -; SSE-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3 -; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[X]], i32 2 -; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[X]], i32 1 -; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[X]], i32 0 -; SSE-NEXT: [[C0:%.*]] = icmp slt i32 [[TMP4]], 42 -; SSE-NEXT: [[C1:%.*]] = icmp slt i32 [[TMP3]], 42 -; SSE-NEXT: [[C2:%.*]] = icmp slt i32 [[TMP2]], 42 -; SSE-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[X]], -; SSE-NEXT: [[TMP6:%.*]] = icmp sgt <4 x i32> [[X]], -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i1> [[TMP5]], <4 x i1> [[TMP6]], <4 x i32> -; SSE-NEXT: [[D3:%.*]] = icmp sgt i32 [[TMP1]], 17 -; SSE-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false -; SSE-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false -; SSE-NEXT: call void @use1(i1 [[S2]]) -; SSE-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]] -; SSE-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP8]]) -; SSE-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP9]], i1 [[S2]], i1 false -; SSE-NEXT: [[S7:%.*]] = select i1 [[OP_EXTRA]], i1 [[D3]], i1 false -; SSE-NEXT: ret i1 [[S7]] -; -; AVX-LABEL: @logical_and_icmp_clamp_extra_use_select( -; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> -; AVX-NEXT: [[TMP1:%.*]] = icmp slt <8 x i32> [[SHUFFLE]], -; AVX-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i32> [[SHUFFLE]], -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 2 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP3]], i32 3 -; AVX-NEXT: [[S1:%.*]] = select i1 [[TMP5]], i1 [[TMP4]], i1 false -; AVX-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP3]], i32 1 -; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[TMP6]], i1 false -; AVX-NEXT: call void @use1(i1 [[S2]]) -; AVX-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0 -; AVX-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[TMP7]], i1 false -; AVX-NEXT: [[TMP8:%.*]] = extractelement <8 x i1> [[TMP3]], i32 7 -; AVX-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[TMP8]], i1 false -; AVX-NEXT: [[TMP9:%.*]] = extractelement <8 x i1> [[TMP3]], i32 6 -; AVX-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[TMP9]], i1 false -; AVX-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP3]], i32 5 -; AVX-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[TMP10]], i1 false -; AVX-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP3]], i32 4 -; AVX-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[TMP11]], i1 false -; AVX-NEXT: ret i1 [[S7]] +; CHECK-LABEL: @logical_and_icmp_clamp_extra_use_select( +; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 +; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 +; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 +; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 +; CHECK-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42 +; CHECK-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42 +; CHECK-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42 +; CHECK-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 42 +; CHECK-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17 +; CHECK-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 +; CHECK-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17 +; CHECK-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 +; CHECK-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false +; CHECK-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false +; CHECK-NEXT: call void @use1(i1 [[S2]]) +; CHECK-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false +; CHECK-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false +; CHECK-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false +; CHECK-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false +; CHECK-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false +; CHECK-NEXT: ret i1 [[S7]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 @@ -435,47 +395,25 @@ define i1 @logical_and_icmp_clamp_v8i32(<8 x i32> %x, <8 x i32> %y) { } define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) { -; SSE-LABEL: @logical_and_icmp_clamp_partial( -; SSE-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 -; SSE-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 -; SSE-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 -; SSE-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 -; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0 -; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X1]], i32 1 -; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[X2]], i32 2 -; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X0]], i32 3 -; SSE-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], -; SSE-NEXT: [[TMP6:%.*]] = icmp sgt <4 x i32> [[TMP4]], -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i1> [[TMP5]], <4 x i1> [[TMP6]], <4 x i32> -; SSE-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 -; SSE-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17 -; SSE-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 -; SSE-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]] -; SSE-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP8]]) -; SSE-NEXT: [[S5:%.*]] = select i1 [[TMP9]], i1 [[D1]], i1 false -; SSE-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false -; SSE-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false -; SSE-NEXT: ret i1 [[S7]] -; -; AVX-LABEL: @logical_and_icmp_clamp_partial( -; AVX-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 -; AVX-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 -; AVX-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 -; AVX-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 -; AVX-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42 -; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42 -; AVX-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42 -; AVX-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17 -; AVX-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 -; AVX-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17 -; AVX-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 -; AVX-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false -; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false -; AVX-NEXT: [[S4:%.*]] = select i1 [[S2]], i1 [[D0]], i1 false -; AVX-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false -; AVX-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false -; AVX-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false -; AVX-NEXT: ret i1 [[S7]] +; CHECK-LABEL: @logical_and_icmp_clamp_partial( +; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 +; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 +; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 +; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 +; CHECK-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42 +; CHECK-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42 +; CHECK-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42 +; CHECK-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17 +; CHECK-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 +; CHECK-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17 +; CHECK-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 +; CHECK-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false +; CHECK-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false +; CHECK-NEXT: [[S4:%.*]] = select i1 [[S2]], i1 [[D0]], i1 false +; CHECK-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false +; CHECK-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false +; CHECK-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false +; CHECK-NEXT: ret i1 [[S7]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 @@ -500,44 +438,27 @@ define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) { } define i1 @logical_and_icmp_clamp_pred_diff(<4 x i32> %x) { -; SSE-LABEL: @logical_and_icmp_clamp_pred_diff( -; SSE-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], -; SSE-NEXT: [[TMP2:%.*]] = icmp ult <4 x i32> [[X]], -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP2]], <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], -; SSE-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]]) -; SSE-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0 -; SSE-NEXT: [[S4:%.*]] = select i1 [[TMP6]], i1 [[TMP7]], i1 false -; SSE-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1 -; SSE-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[TMP8]], i1 false -; SSE-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2 -; SSE-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[TMP9]], i1 false -; SSE-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3 -; SSE-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[TMP10]], i1 false -; SSE-NEXT: ret i1 [[S7]] -; -; AVX-LABEL: @logical_and_icmp_clamp_pred_diff( -; AVX-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 -; AVX-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 -; AVX-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 -; AVX-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 -; AVX-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42 -; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42 -; AVX-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42 -; AVX-NEXT: [[C3:%.*]] = icmp ult i32 [[X3]], 42 -; AVX-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17 -; AVX-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 -; AVX-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17 -; AVX-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 -; AVX-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false -; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false -; AVX-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false -; AVX-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false -; AVX-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false -; AVX-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false -; AVX-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false -; AVX-NEXT: ret i1 [[S7]] +; CHECK-LABEL: @logical_and_icmp_clamp_pred_diff( +; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 +; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 +; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 +; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 +; CHECK-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42 +; CHECK-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42 +; CHECK-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42 +; CHECK-NEXT: [[C3:%.*]] = icmp ult i32 [[X3]], 42 +; CHECK-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17 +; CHECK-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 +; CHECK-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17 +; CHECK-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 +; CHECK-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false +; CHECK-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false +; CHECK-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false +; CHECK-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false +; CHECK-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false +; CHECK-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false +; CHECK-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false +; CHECK-NEXT: ret i1 [[S7]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 -- 2.7.4