From 58fc00d0bc1d33afbecfd7d83fb60529b7decb40 Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Wed, 10 Oct 2018 20:39:39 +0000
Subject: [PATCH] revert r344082: [InstCombine] reverse 'trunc X to <N x i1>'
 canonicalization

This commit accidentally included the diffs from D53057.

llvm-svn: 344178
---
 .../Transforms/InstCombine/InstCombineCasts.cpp    |  31 +-
 .../Transforms/InstCombine/InstCombineCompares.cpp |   7 -
 .../InstCombine/InstCombineVectorOps.cpp           |  30 --
 llvm/test/Transforms/InstCombine/apint-shift.ll    |   4 +-
 .../test/Transforms/InstCombine/apint-shl-trunc.ll |   5 +-
 llvm/test/Transforms/InstCombine/icmp.ll           |  20 +-
 llvm/test/Transforms/InstCombine/vec_shuffle.ll    |   5 +-
 llvm/test/Transforms/InstCombine/vector-casts.ll   |  19 +-
 .../LoopVectorize/X86/masked_load_store.ll         | 336 +++++++++++----------
 9 files changed, 212 insertions(+), 245 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 74f1e69..fd59c3a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -706,35 +706,12 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
   if (SimplifyDemandedInstructionBits(CI))
     return &CI;
 
+  // Canonicalize trunc x to i1 -> (icmp ne (and x, 1), 0), likewise for vector.
   if (DestTy->getScalarSizeInBits() == 1) {
+    Constant *One = ConstantInt::get(SrcTy, 1);
+    Src = Builder.CreateAnd(Src, One);
     Value *Zero = Constant::getNullValue(Src->getType());
-    if (DestTy->isIntegerTy()) {
-      // Canonicalize trunc x to i1 -> icmp ne (and x, 1), 0 (scalar only).
-      // TODO: We canonicalize to more instructions here because we are probably
-      // lacking equivalent analysis for trunc relative to icmp. There may also
-      // be codegen concerns. If those trunc limitations were removed, we could
-      // remove this transform.
-      Value *And = Builder.CreateAnd(Src, ConstantInt::get(SrcTy, 1));
-      return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
-    }
-
-    // For vectors, we do not canonicalize all truncs to icmp, so optimize
-    // patterns that would be covered within visitICmpInst.
-    Value *X;
-    const APInt *C;
-    if (match(Src, m_OneUse(m_LShr(m_Value(X), m_APInt(C))))) {
-      // trunc (lshr X, C) to i1 --> icmp ne (and X, C'), 0
-      APInt MaskC = APInt(SrcTy->getScalarSizeInBits(), 1).shl(*C);
-      Value *And = Builder.CreateAnd(X, ConstantInt::get(SrcTy, MaskC));
-      return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
-    }
-    if (match(Src, m_OneUse(m_c_Or(m_LShr(m_Value(X), m_APInt(C)),
-                                   m_Deferred(X))))) {
-      // trunc (or (lshr X, C), X) to i1 --> icmp ne (and X, C'), 0
-      APInt MaskC = APInt(SrcTy->getScalarSizeInBits(), 1).shl(*C) | 1;
-      Value *And = Builder.CreateAnd(X, ConstantInt::get(SrcTy, MaskC));
-      return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
-    }
+    return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero);
   }
 
   // FIXME: Maybe combine the next two transforms to handle the no cast case
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index bf8bc88..07bd98b 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -1609,13 +1609,6 @@ Instruction *InstCombiner::foldICmpAndShift(ICmpInst &Cmp, BinaryOperator *And,
 Instruction *InstCombiner::foldICmpAndConstConst(ICmpInst &Cmp,
                                                  BinaryOperator *And,
                                                  const APInt &C1) {
-  // For vectors: icmp ne (and X, 1), 0 --> trunc X to N x i1
-  // TODO: We canonicalize to the longer form for scalars because we have
-  // better analysis/folds for icmp, and codegen may be better with icmp.
-  if (Cmp.getPredicate() == CmpInst::ICMP_NE && Cmp.getType()->isVectorTy() &&
-      C1.isNullValue() && match(And->getOperand(1), m_One()))
-    return new TruncInst(And->getOperand(0), Cmp.getType());
-
   const APInt *C2;
   if (!match(And->getOperand(1), m_APInt(C2)))
     return nullptr;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 7258127..61a3e31 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -1477,33 +1477,6 @@ static Instruction *narrowVectorSelect(ShuffleVectorInst &Shuf,
   return SelectInst::Create(NarrowCond, NarrowX, NarrowY);
 }
 
-/// Try to combine 2 shuffles into 1 shuffle by concatenating a shuffle mask.
-static Instruction *foldIdentityExtractShuffle(ShuffleVectorInst &Shuf) {
-  Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);
-  if (!Shuf.isIdentityWithExtract() || !isa<UndefValue>(Op1))
-    return nullptr;
-
-  Value *X, *Y;
-  Constant *Mask;
-  if (!match(Op0, m_ShuffleVector(m_Value(X), m_Value(Y), m_Constant(Mask))))
-    return nullptr;
-
-  // We are extracting a subvector from a shuffle. Remove excess elements from
-  // the 1st shuffle mask to eliminate the extract.
-  // shuf (shuf X, Y, <C0, C1, C2, C3>), undef, <0, undef, 2> -->
-  // shuf X, Y, <C0, undef, C2>
-  unsigned NumElts = Shuf.getType()->getVectorNumElements();
-  SmallVector<Constant *, 16> NewMask(NumElts);
-  for (unsigned i = 0; i != NumElts; ++i) {
-    // If the extracting shuffle has an undef mask element, it transfers to the
-    // new shuffle mask. Otherwise, copy the original mask element.
-    Constant *ExtractMaskElt = Shuf.getMask()->getAggregateElement(i);
-    Constant *MaskElt = Mask->getAggregateElement(i);
-    NewMask[i] = isa<UndefValue>(ExtractMaskElt) ?
ExtractMaskElt : MaskElt; - } - return new ShuffleVectorInst(X, Y, ConstantVector::get(NewMask)); -} - Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { Value *LHS = SVI.getOperand(0); Value *RHS = SVI.getOperand(1); @@ -1526,9 +1499,6 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { return &SVI; } - if (Instruction *I = foldIdentityExtractShuffle(SVI)) - return I; - SmallVector Mask = SVI.getShuffleMask(); Type *Int32Ty = Type::getInt32Ty(SVI.getContext()); unsigned LHSWidth = LHS->getType()->getVectorNumElements(); diff --git a/llvm/test/Transforms/InstCombine/apint-shift.ll b/llvm/test/Transforms/InstCombine/apint-shift.ll index 3266fa6..fc56466 100644 --- a/llvm/test/Transforms/InstCombine/apint-shift.ll +++ b/llvm/test/Transforms/InstCombine/apint-shift.ll @@ -319,8 +319,8 @@ define i1 @test16(i84 %X) { define <2 x i1> @test16vec(<2 x i84> %X) { ; CHECK-LABEL: @test16vec( -; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i84> [[X:%.*]], -; CHECK-NEXT: [[CMP:%.*]] = icmp ne <2 x i84> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[AND:%.*]] = and <2 x i84> %X, +; CHECK-NEXT: [[CMP:%.*]] = icmp ne <2 x i84> [[AND]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %shr = ashr <2 x i84> %X, diff --git a/llvm/test/Transforms/InstCombine/apint-shl-trunc.ll b/llvm/test/Transforms/InstCombine/apint-shl-trunc.ll index 2241c88..c7d7d36 100644 --- a/llvm/test/Transforms/InstCombine/apint-shl-trunc.ll +++ b/llvm/test/Transforms/InstCombine/apint-shl-trunc.ll @@ -27,8 +27,9 @@ define i1 @test1(i799 %X, i799 %A) { define <2 x i1> @test0vec(<2 x i39> %X, <2 x i39> %A) { ; CHECK-LABEL: @test0vec( -; CHECK-NEXT: [[B:%.*]] = lshr <2 x i39> [[X:%.*]], [[A:%.*]] -; CHECK-NEXT: [[D:%.*]] = trunc <2 x i39> [[B]] to <2 x i1> +; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i39> , [[A:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i39> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[D:%.*]] = icmp ne <2 x i39> [[TMP2]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[D]] ; %B = lshr <2 x i39> %X, %A diff --git a/llvm/test/Transforms/InstCombine/icmp.ll b/llvm/test/Transforms/InstCombine/icmp.ll index 1f05bb6..1f97009 100644 --- a/llvm/test/Transforms/InstCombine/icmp.ll +++ b/llvm/test/Transforms/InstCombine/icmp.ll @@ -2427,9 +2427,10 @@ define i1 @icmp_and_or_lshr(i32 %x, i32 %y) { define <2 x i1> @icmp_and_or_lshr_vec(<2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: @icmp_and_or_lshr_vec( -; CHECK-NEXT: [[SHF:%.*]] = lshr <2 x i32> [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[OR:%.*]] = or <2 x i32> [[SHF]], [[X]] -; CHECK-NEXT: [[RET:%.*]] = trunc <2 x i32> [[OR]] to <2 x i1> +; CHECK-NEXT: [[SHF1:%.*]] = shl nuw <2 x i32> , [[Y:%.*]] +; CHECK-NEXT: [[OR2:%.*]] = or <2 x i32> [[SHF1]], +; CHECK-NEXT: [[AND3:%.*]] = and <2 x i32> [[OR2]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[AND3]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[RET]] ; %shf = lshr <2 x i32> %x, %y @@ -2444,7 +2445,8 @@ define <2 x i1> @icmp_and_or_lshr_vec_commute(<2 x i32> %xp, <2 x i32> %y) { ; CHECK-NEXT: [[X:%.*]] = srem <2 x i32> [[XP:%.*]], ; CHECK-NEXT: [[SHF:%.*]] = lshr <2 x i32> [[X]], [[Y:%.*]] ; CHECK-NEXT: [[OR:%.*]] = or <2 x i32> [[X]], [[SHF]] -; CHECK-NEXT: [[RET:%.*]] = trunc <2 x i32> [[OR]] to <2 x i1> +; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> [[OR]], +; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[AND]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[RET]] ; %x = srem <2 x i32> %xp, ; prevent complexity-based canonicalization @@ -2470,8 +2472,8 @@ define i1 @icmp_and_or_lshr_cst(i32 %x) { 
define <2 x i1> @icmp_and_or_lshr_cst_vec(<2 x i32> %x) { ; CHECK-LABEL: @icmp_and_or_lshr_cst_vec( -; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[X:%.*]], -; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[AND1:%.*]] = and <2 x i32> [[X:%.*]], +; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[AND1]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[RET]] ; %shf = lshr <2 x i32> %x, @@ -2484,8 +2486,10 @@ define <2 x i1> @icmp_and_or_lshr_cst_vec(<2 x i32> %x) { define <2 x i1> @icmp_and_or_lshr_cst_vec_commute(<2 x i32> %xp) { ; CHECK-LABEL: @icmp_and_or_lshr_cst_vec_commute( ; CHECK-NEXT: [[X:%.*]] = srem <2 x i32> [[XP:%.*]], -; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[X]], -; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[SHF:%.*]] = lshr <2 x i32> [[X]], +; CHECK-NEXT: [[OR:%.*]] = or <2 x i32> [[X]], [[SHF]] +; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> [[OR]], +; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[AND]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[RET]] ; %x = srem <2 x i32> %xp, ; prevent complexity-based canonicalization diff --git a/llvm/test/Transforms/InstCombine/vec_shuffle.ll b/llvm/test/Transforms/InstCombine/vec_shuffle.ll index 8adb211..e9c3539 100644 --- a/llvm/test/Transforms/InstCombine/vec_shuffle.ll +++ b/llvm/test/Transforms/InstCombine/vec_shuffle.ll @@ -174,7 +174,8 @@ define <8 x i8> @test12a(<8 x i8> %t6, <8 x i8> %t2) { define <2 x i8> @extract_subvector_of_shuffle(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @extract_subvector_of_shuffle( -; CHECK-NEXT: [[EXTRACT_SUBV:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]], <2 x i32> +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]], <3 x i32> +; CHECK-NEXT: [[EXTRACT_SUBV:%.*]] = shufflevector <3 x i8> [[SHUF]], <3 x i8> undef, <2 x i32> ; CHECK-NEXT: ret <2 x i8> [[EXTRACT_SUBV]] ; %shuf = shufflevector <2 x i8> %x, <2 x i8> %y, <3 x i32> @@ -193,7 +194,7 @@ define <4 x i8> @extract_subvector_of_shuffle_extra_use(<2 x i8> %x, <2 x i8> %y ; CHECK-LABEL: @extract_subvector_of_shuffle_extra_use( ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]], <5 x i32> ; CHECK-NEXT: call void @use_v5i8(<5 x i8> [[SHUF]]) -; CHECK-NEXT: [[EXTRACT_SUBV:%.*]] = shufflevector <2 x i8> [[X]], <2 x i8> [[Y]], <4 x i32> +; CHECK-NEXT: [[EXTRACT_SUBV:%.*]] = shufflevector <5 x i8> [[SHUF]], <5 x i8> undef, <4 x i32> ; CHECK-NEXT: ret <4 x i8> [[EXTRACT_SUBV]] ; %shuf = shufflevector <2 x i8> %x, <2 x i8> %y, <5 x i32> diff --git a/llvm/test/Transforms/InstCombine/vector-casts.ll b/llvm/test/Transforms/InstCombine/vector-casts.ll index e0d6083..6e0d66b8 100644 --- a/llvm/test/Transforms/InstCombine/vector-casts.ll +++ b/llvm/test/Transforms/InstCombine/vector-casts.ll @@ -1,22 +1,26 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instcombine -S | FileCheck %s -; Can't get smaller than this. +; This turns into a&1 != 0 +; TODO: The bar for canonicalizing to something bigger than the minimal length IR is very high. +; This pattern does not appear to meet that standard. define <2 x i1> @trunc(<2 x i64> %a) { ; CHECK-LABEL: @trunc( -; CHECK-NEXT: [[T:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1> +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i64> [[A:%.*]], +; CHECK-NEXT: [[T:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[T]] ; %t = trunc <2 x i64> %a to <2 x i1> ret <2 x i1> %t } -; This is trunc. 
+; TODO: This could be just 1 instruction (trunc). define <2 x i1> @and_cmp_is_trunc(<2 x i64> %a) { ; CHECK-LABEL: @and_cmp_is_trunc( -; CHECK-NEXT: [[R:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1> +; CHECK-NEXT: [[T:%.*]] = and <2 x i64> [[A:%.*]], +; CHECK-NEXT: [[R:%.*]] = icmp ne <2 x i64> [[T]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[R]] ; %t = and <2 x i64> %a, @@ -24,11 +28,12 @@ define <2 x i1> @and_cmp_is_trunc(<2 x i64> %a) { ret <2 x i1> %r } -; This is trunc. +; TODO: This could be just 1 instruction (trunc). define <2 x i1> @and_cmp_is_trunc_even_with_undef_elt(<2 x i64> %a) { ; CHECK-LABEL: @and_cmp_is_trunc_even_with_undef_elt( -; CHECK-NEXT: [[R:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1> +; CHECK-NEXT: [[T:%.*]] = and <2 x i64> [[A:%.*]], +; CHECK-NEXT: [[R:%.*]] = icmp ne <2 x i64> [[T]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[R]] ; %t = and <2 x i64> %a, @@ -36,7 +41,7 @@ define <2 x i1> @and_cmp_is_trunc_even_with_undef_elt(<2 x i64> %a) { ret <2 x i1> %r } -; TODO: This could be just 1 instruction (trunc), but our undef matching is incomplete. +; TODO: This could be just 1 instruction (trunc). define <2 x i1> @and_cmp_is_trunc_even_with_undef_elts(<2 x i64> %a) { ; CHECK-LABEL: @and_cmp_is_trunc_even_with_undef_elts( diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll index 71038fe..8e94863 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -2901,45 +2901,49 @@ define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigg ; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 12 ; AVX-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>* ; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1 -; AVX-NEXT: [[TMP8:%.*]] = trunc <4 x i8> [[WIDE_LOAD]] to <4 x i1> -; AVX-NEXT: [[TMP9:%.*]] = trunc <4 x i8> [[WIDE_LOAD10]] to <4 x i1> -; AVX-NEXT: [[TMP10:%.*]] = trunc <4 x i8> [[WIDE_LOAD11]] to <4 x i1> -; AVX-NEXT: [[TMP11:%.*]] = trunc <4 x i8> [[WIDE_LOAD12]] to <4 x i1> -; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]] -; AVX-NEXT: [[TMP13:%.*]] = bitcast double** [[TMP12]] to <4 x double*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x double*> undef) -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 4 -; AVX-NEXT: [[TMP15:%.*]] = bitcast double** [[TMP14]] to <4 x double*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP15]], i32 8, <4 x i1> [[TMP9]], <4 x double*> undef) -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 8 +; AVX-NEXT: [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD]], +; AVX-NEXT: [[TMP9:%.*]] = and <4 x i8> [[WIDE_LOAD10]], +; AVX-NEXT: [[TMP10:%.*]] = and <4 x i8> [[WIDE_LOAD11]], +; AVX-NEXT: [[TMP11:%.*]] = and <4 x i8> [[WIDE_LOAD12]], +; AVX-NEXT: [[TMP12:%.*]] = icmp ne <4 x i8> [[TMP8]], zeroinitializer +; AVX-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[TMP9]], zeroinitializer +; AVX-NEXT: [[TMP14:%.*]] = icmp ne <4 x i8> [[TMP10]], zeroinitializer +; AVX-NEXT: [[TMP15:%.*]] = icmp ne <4 x i8> [[TMP11]], zeroinitializer +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]] ; 
AVX-NEXT: [[TMP17:%.*]] = bitcast double** [[TMP16]] to <4 x double*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP17]], i32 8, <4 x i1> [[TMP10]], <4 x double*> undef) -; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 12 +; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP17]], i32 8, <4 x i1> [[TMP12]], <4 x double*> undef) +; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 4 ; AVX-NEXT: [[TMP19:%.*]] = bitcast double** [[TMP18]] to <4 x double*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP19]], i32 8, <4 x i1> [[TMP11]], <4 x double*> undef) -; AVX-NEXT: [[TMP20:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX-NEXT: [[TMP21:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer -; AVX-NEXT: [[TMP22:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer -; AVX-NEXT: [[TMP23:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer -; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] -; AVX-NEXT: [[TMP25:%.*]] = and <4 x i1> [[TMP20]], [[TMP8]] -; AVX-NEXT: [[TMP26:%.*]] = and <4 x i1> [[TMP21]], [[TMP9]] -; AVX-NEXT: [[TMP27:%.*]] = and <4 x i1> [[TMP22]], [[TMP10]] -; AVX-NEXT: [[TMP28:%.*]] = and <4 x i1> [[TMP23]], [[TMP11]] -; AVX-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP24]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP25]]) -; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 4 -; AVX-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP26]]) -; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8 -; AVX-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP27]]) -; AVX-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 12 +; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP19]], i32 8, <4 x i1> [[TMP13]], <4 x double*> undef) +; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 8 +; AVX-NEXT: [[TMP21:%.*]] = bitcast double** [[TMP20]] to <4 x double*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP21]], i32 8, <4 x i1> [[TMP14]], <4 x double*> undef) +; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 12 +; AVX-NEXT: [[TMP23:%.*]] = bitcast double** [[TMP22]] to <4 x double*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP23]], i32 8, <4 x i1> [[TMP15]], <4 x double*> undef) +; AVX-NEXT: [[TMP24:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX-NEXT: [[TMP25:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer +; AVX-NEXT: [[TMP26:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer +; AVX-NEXT: [[TMP27:%.*]] = icmp ne <4 x double*> 
[[WIDE_MASKED_LOAD15]], zeroinitializer +; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] +; AVX-NEXT: [[TMP29:%.*]] = and <4 x i1> [[TMP24]], [[TMP12]] +; AVX-NEXT: [[TMP30:%.*]] = and <4 x i1> [[TMP25]], [[TMP13]] +; AVX-NEXT: [[TMP31:%.*]] = and <4 x i1> [[TMP26]], [[TMP14]] +; AVX-NEXT: [[TMP32:%.*]] = and <4 x i1> [[TMP27]], [[TMP15]] +; AVX-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP29]]) +; AVX-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 4 ; AVX-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP28]]) +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP30]]) +; AVX-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8 +; AVX-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP31]]) +; AVX-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 12 +; AVX-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP39]], i32 8, <4 x i1> [[TMP32]]) ; AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 -; AVX-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !51 +; AVX-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !51 ; AVX: middle.block: ; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; AVX-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]] @@ -2949,14 +2953,14 @@ define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigg ; AVX: for.body: ; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ] ; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AVX-NEXT: [[TMP38:%.*]] = and i8 [[TMP37]], 1 -; AVX-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0 +; AVX-NEXT: [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; AVX-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1 +; AVX-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0 ; AVX-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX: land.lhs.true: ; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]] -; AVX-NEXT: [[TMP39:%.*]] = load double*, double** [[ARRAYIDX2]], align 8 -; AVX-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP39]], null +; AVX-NEXT: [[TMP43:%.*]] = load double*, double** [[ARRAYIDX2]], align 8 +; AVX-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP43]], null ; AVX-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX: if.then: ; AVX-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] @@ -2994,45 +2998,49 @@ define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigg ; AVX512-NEXT: 
[[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 24 ; AVX512-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>* ; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1 -; AVX512-NEXT: [[TMP8:%.*]] = trunc <8 x i8> [[WIDE_LOAD]] to <8 x i1> -; AVX512-NEXT: [[TMP9:%.*]] = trunc <8 x i8> [[WIDE_LOAD10]] to <8 x i1> -; AVX512-NEXT: [[TMP10:%.*]] = trunc <8 x i8> [[WIDE_LOAD11]] to <8 x i1> -; AVX512-NEXT: [[TMP11:%.*]] = trunc <8 x i8> [[WIDE_LOAD12]] to <8 x i1> -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]] -; AVX512-NEXT: [[TMP13:%.*]] = bitcast double** [[TMP12]] to <8 x double*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP13]], i32 8, <8 x i1> [[TMP8]], <8 x double*> undef) -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 8 -; AVX512-NEXT: [[TMP15:%.*]] = bitcast double** [[TMP14]] to <8 x double*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP15]], i32 8, <8 x i1> [[TMP9]], <8 x double*> undef) -; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 16 +; AVX512-NEXT: [[TMP8:%.*]] = and <8 x i8> [[WIDE_LOAD]], +; AVX512-NEXT: [[TMP9:%.*]] = and <8 x i8> [[WIDE_LOAD10]], +; AVX512-NEXT: [[TMP10:%.*]] = and <8 x i8> [[WIDE_LOAD11]], +; AVX512-NEXT: [[TMP11:%.*]] = and <8 x i8> [[WIDE_LOAD12]], +; AVX512-NEXT: [[TMP12:%.*]] = icmp ne <8 x i8> [[TMP8]], zeroinitializer +; AVX512-NEXT: [[TMP13:%.*]] = icmp ne <8 x i8> [[TMP9]], zeroinitializer +; AVX512-NEXT: [[TMP14:%.*]] = icmp ne <8 x i8> [[TMP10]], zeroinitializer +; AVX512-NEXT: [[TMP15:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]] ; AVX512-NEXT: [[TMP17:%.*]] = bitcast double** [[TMP16]] to <8 x double*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP17]], i32 8, <8 x i1> [[TMP10]], <8 x double*> undef) -; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 24 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP17]], i32 8, <8 x i1> [[TMP12]], <8 x double*> undef) +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 8 ; AVX512-NEXT: [[TMP19:%.*]] = bitcast double** [[TMP18]] to <8 x double*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP19]], i32 8, <8 x i1> [[TMP11]], <8 x double*> undef) -; AVX512-NEXT: [[TMP20:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX512-NEXT: [[TMP21:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer -; AVX512-NEXT: [[TMP22:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer -; AVX512-NEXT: [[TMP23:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] -; AVX512-NEXT: [[TMP25:%.*]] = and <8 x i1> [[TMP20]], [[TMP8]] -; AVX512-NEXT: [[TMP26:%.*]] = and <8 x i1> [[TMP21]], [[TMP9]] -; AVX512-NEXT: [[TMP27:%.*]] = and <8 x i1> [[TMP22]], [[TMP10]] -; AVX512-NEXT: [[TMP28:%.*]] = and <8 x i1> [[TMP23]], [[TMP11]] -; AVX512-NEXT: [[TMP29:%.*]] = bitcast 
double* [[TMP24]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP25]]) -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8 -; AVX512-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP26]]) -; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 16 -; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP27]]) -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 24 +; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP19]], i32 8, <8 x i1> [[TMP13]], <8 x double*> undef) +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 16 +; AVX512-NEXT: [[TMP21:%.*]] = bitcast double** [[TMP20]] to <8 x double*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP21]], i32 8, <8 x i1> [[TMP14]], <8 x double*> undef) +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 24 +; AVX512-NEXT: [[TMP23:%.*]] = bitcast double** [[TMP22]] to <8 x double*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP23]], i32 8, <8 x i1> [[TMP15]], <8 x double*> undef) +; AVX512-NEXT: [[TMP24:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX512-NEXT: [[TMP25:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer +; AVX512-NEXT: [[TMP26:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer +; AVX512-NEXT: [[TMP27:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP29:%.*]] = and <8 x i1> [[TMP24]], [[TMP12]] +; AVX512-NEXT: [[TMP30:%.*]] = and <8 x i1> [[TMP25]], [[TMP13]] +; AVX512-NEXT: [[TMP31:%.*]] = and <8 x i1> [[TMP26]], [[TMP14]] +; AVX512-NEXT: [[TMP32:%.*]] = and <8 x i1> [[TMP27]], [[TMP15]] +; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP29]]) +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8 ; AVX512-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP28]]) +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP30]]) +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 16 +; AVX512-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP37]], i32 8, <8 x i1> [[TMP31]]) +; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 24 +; AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 
x double> , <8 x double>* [[TMP39]], i32 8, <8 x i1> [[TMP32]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 -; AVX512-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX512-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !63 +; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX512-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !63 ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]] @@ -3042,14 +3050,14 @@ define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigg ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ] ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AVX512-NEXT: [[TMP38:%.*]] = and i8 [[TMP37]], 1 -; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0 +; AVX512-NEXT: [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; AVX512-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1 +; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0 ; AVX512-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX512: land.lhs.true: ; AVX512-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP39:%.*]] = load double*, double** [[ARRAYIDX2]], align 8 -; AVX512-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP39]], null +; AVX512-NEXT: [[TMP43:%.*]] = load double*, double** [[ARRAYIDX2]], align 8 +; AVX512-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP43]], null ; AVX512-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX512: if.then: ; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] @@ -3154,45 +3162,49 @@ define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigg ; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 12 ; AVX-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>* ; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1 -; AVX-NEXT: [[TMP8:%.*]] = trunc <4 x i8> [[WIDE_LOAD]] to <4 x i1> -; AVX-NEXT: [[TMP9:%.*]] = trunc <4 x i8> [[WIDE_LOAD10]] to <4 x i1> -; AVX-NEXT: [[TMP10:%.*]] = trunc <4 x i8> [[WIDE_LOAD11]] to <4 x i1> -; AVX-NEXT: [[TMP11:%.*]] = trunc <4 x i8> [[WIDE_LOAD12]] to <4 x i1> -; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]] -; AVX-NEXT: [[TMP13:%.*]] = bitcast i32 ()** [[TMP12]] to <4 x i32 ()*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x i32 ()*> undef) -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 4 -; AVX-NEXT: [[TMP15:%.*]] = bitcast i32 ()** [[TMP14]] to <4 x i32 ()*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP15]], i32 8, <4 x i1> [[TMP9]], <4 x i32 ()*> undef) -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 8 +; AVX-NEXT: [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD]], +; AVX-NEXT: [[TMP9:%.*]] = and <4 x i8> [[WIDE_LOAD10]], +; AVX-NEXT: [[TMP10:%.*]] = and <4 x 
i8> [[WIDE_LOAD11]], +; AVX-NEXT: [[TMP11:%.*]] = and <4 x i8> [[WIDE_LOAD12]], +; AVX-NEXT: [[TMP12:%.*]] = icmp ne <4 x i8> [[TMP8]], zeroinitializer +; AVX-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[TMP9]], zeroinitializer +; AVX-NEXT: [[TMP14:%.*]] = icmp ne <4 x i8> [[TMP10]], zeroinitializer +; AVX-NEXT: [[TMP15:%.*]] = icmp ne <4 x i8> [[TMP11]], zeroinitializer +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]] ; AVX-NEXT: [[TMP17:%.*]] = bitcast i32 ()** [[TMP16]] to <4 x i32 ()*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP17]], i32 8, <4 x i1> [[TMP10]], <4 x i32 ()*> undef) -; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 12 +; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP17]], i32 8, <4 x i1> [[TMP12]], <4 x i32 ()*> undef) +; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 4 ; AVX-NEXT: [[TMP19:%.*]] = bitcast i32 ()** [[TMP18]] to <4 x i32 ()*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP19]], i32 8, <4 x i1> [[TMP11]], <4 x i32 ()*> undef) -; AVX-NEXT: [[TMP20:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX-NEXT: [[TMP21:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer -; AVX-NEXT: [[TMP22:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer -; AVX-NEXT: [[TMP23:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer -; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] -; AVX-NEXT: [[TMP25:%.*]] = and <4 x i1> [[TMP20]], [[TMP8]] -; AVX-NEXT: [[TMP26:%.*]] = and <4 x i1> [[TMP21]], [[TMP9]] -; AVX-NEXT: [[TMP27:%.*]] = and <4 x i1> [[TMP22]], [[TMP10]] -; AVX-NEXT: [[TMP28:%.*]] = and <4 x i1> [[TMP23]], [[TMP11]] -; AVX-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP24]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP25]]) -; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 4 -; AVX-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP26]]) -; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8 -; AVX-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP27]]) -; AVX-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 12 +; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP19]], i32 8, <4 x i1> [[TMP13]], <4 x i32 ()*> undef) +; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 8 +; AVX-NEXT: [[TMP21:%.*]] = bitcast i32 ()** [[TMP20]] to <4 x i32 ()*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP21]], i32 8, <4 x i1> [[TMP14]], <4 x i32 ()*> undef) +; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 12 +; AVX-NEXT: [[TMP23:%.*]] = bitcast i32 ()** [[TMP22]] to <4 x i32 ()*>* +; AVX-NEXT: 
[[WIDE_MASKED_LOAD15:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP23]], i32 8, <4 x i1> [[TMP15]], <4 x i32 ()*> undef) +; AVX-NEXT: [[TMP24:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX-NEXT: [[TMP25:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer +; AVX-NEXT: [[TMP26:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer +; AVX-NEXT: [[TMP27:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer +; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] +; AVX-NEXT: [[TMP29:%.*]] = and <4 x i1> [[TMP24]], [[TMP12]] +; AVX-NEXT: [[TMP30:%.*]] = and <4 x i1> [[TMP25]], [[TMP13]] +; AVX-NEXT: [[TMP31:%.*]] = and <4 x i1> [[TMP26]], [[TMP14]] +; AVX-NEXT: [[TMP32:%.*]] = and <4 x i1> [[TMP27]], [[TMP15]] +; AVX-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP29]]) +; AVX-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 4 ; AVX-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP28]]) +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP30]]) +; AVX-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8 +; AVX-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP31]]) +; AVX-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 12 +; AVX-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP39]], i32 8, <4 x i1> [[TMP32]]) ; AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 -; AVX-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !54 +; AVX-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !54 ; AVX: middle.block: ; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; AVX-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]] @@ -3202,14 +3214,14 @@ define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigg ; AVX: for.body: ; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ] ; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AVX-NEXT: [[TMP38:%.*]] = and i8 [[TMP37]], 1 -; AVX-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0 +; AVX-NEXT: [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; AVX-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1 +; AVX-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0 ; AVX-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX: land.lhs.true: ; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]] -; AVX-NEXT: [[TMP39:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8 -; AVX-NEXT: 
[[CMP3:%.*]] = icmp eq i32 ()* [[TMP39]], null +; AVX-NEXT: [[TMP43:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8 +; AVX-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP43]], null ; AVX-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX: if.then: ; AVX-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] @@ -3247,45 +3259,49 @@ define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigg ; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 24 ; AVX512-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>* ; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1 -; AVX512-NEXT: [[TMP8:%.*]] = trunc <8 x i8> [[WIDE_LOAD]] to <8 x i1> -; AVX512-NEXT: [[TMP9:%.*]] = trunc <8 x i8> [[WIDE_LOAD10]] to <8 x i1> -; AVX512-NEXT: [[TMP10:%.*]] = trunc <8 x i8> [[WIDE_LOAD11]] to <8 x i1> -; AVX512-NEXT: [[TMP11:%.*]] = trunc <8 x i8> [[WIDE_LOAD12]] to <8 x i1> -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]] -; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32 ()** [[TMP12]] to <8 x i32 ()*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP13]], i32 8, <8 x i1> [[TMP8]], <8 x i32 ()*> undef) -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 8 -; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32 ()** [[TMP14]] to <8 x i32 ()*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP15]], i32 8, <8 x i1> [[TMP9]], <8 x i32 ()*> undef) -; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 16 +; AVX512-NEXT: [[TMP8:%.*]] = and <8 x i8> [[WIDE_LOAD]], +; AVX512-NEXT: [[TMP9:%.*]] = and <8 x i8> [[WIDE_LOAD10]], +; AVX512-NEXT: [[TMP10:%.*]] = and <8 x i8> [[WIDE_LOAD11]], +; AVX512-NEXT: [[TMP11:%.*]] = and <8 x i8> [[WIDE_LOAD12]], +; AVX512-NEXT: [[TMP12:%.*]] = icmp ne <8 x i8> [[TMP8]], zeroinitializer +; AVX512-NEXT: [[TMP13:%.*]] = icmp ne <8 x i8> [[TMP9]], zeroinitializer +; AVX512-NEXT: [[TMP14:%.*]] = icmp ne <8 x i8> [[TMP10]], zeroinitializer +; AVX512-NEXT: [[TMP15:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]] ; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32 ()** [[TMP16]] to <8 x i32 ()*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP17]], i32 8, <8 x i1> [[TMP10]], <8 x i32 ()*> undef) -; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 24 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP17]], i32 8, <8 x i1> [[TMP12]], <8 x i32 ()*> undef) +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 8 ; AVX512-NEXT: [[TMP19:%.*]] = bitcast i32 ()** [[TMP18]] to <8 x i32 ()*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP19]], i32 8, <8 x i1> [[TMP11]], <8 x i32 ()*> undef) -; AVX512-NEXT: [[TMP20:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX512-NEXT: [[TMP21:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer -; AVX512-NEXT: [[TMP22:%.*]] = icmp ne <8 x i32 ()*> 
[[WIDE_MASKED_LOAD14]], zeroinitializer -; AVX512-NEXT: [[TMP23:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] -; AVX512-NEXT: [[TMP25:%.*]] = and <8 x i1> [[TMP20]], [[TMP8]] -; AVX512-NEXT: [[TMP26:%.*]] = and <8 x i1> [[TMP21]], [[TMP9]] -; AVX512-NEXT: [[TMP27:%.*]] = and <8 x i1> [[TMP22]], [[TMP10]] -; AVX512-NEXT: [[TMP28:%.*]] = and <8 x i1> [[TMP23]], [[TMP11]] -; AVX512-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP24]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP25]]) -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8 -; AVX512-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP26]]) -; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 16 -; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP27]]) -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 24 +; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP19]], i32 8, <8 x i1> [[TMP13]], <8 x i32 ()*> undef) +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 16 +; AVX512-NEXT: [[TMP21:%.*]] = bitcast i32 ()** [[TMP20]] to <8 x i32 ()*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP21]], i32 8, <8 x i1> [[TMP14]], <8 x i32 ()*> undef) +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 24 +; AVX512-NEXT: [[TMP23:%.*]] = bitcast i32 ()** [[TMP22]] to <8 x i32 ()*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP23]], i32 8, <8 x i1> [[TMP15]], <8 x i32 ()*> undef) +; AVX512-NEXT: [[TMP24:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX512-NEXT: [[TMP25:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer +; AVX512-NEXT: [[TMP26:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer +; AVX512-NEXT: [[TMP27:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP29:%.*]] = and <8 x i1> [[TMP24]], [[TMP12]] +; AVX512-NEXT: [[TMP30:%.*]] = and <8 x i1> [[TMP25]], [[TMP13]] +; AVX512-NEXT: [[TMP31:%.*]] = and <8 x i1> [[TMP26]], [[TMP14]] +; AVX512-NEXT: [[TMP32:%.*]] = and <8 x i1> [[TMP27]], [[TMP15]] +; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP29]]) +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8 ; AVX512-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP28]]) +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* 
[[TMP35]], i32 8, <8 x i1> [[TMP30]]) +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 16 +; AVX512-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP37]], i32 8, <8 x i1> [[TMP31]]) +; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 24 +; AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP39]], i32 8, <8 x i1> [[TMP32]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 -; AVX512-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX512-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !66 +; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX512-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !66 ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]] @@ -3295,14 +3311,14 @@ define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigg ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ] ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AVX512-NEXT: [[TMP38:%.*]] = and i8 [[TMP37]], 1 -; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0 +; AVX512-NEXT: [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; AVX512-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1 +; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0 ; AVX512-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX512: land.lhs.true: ; AVX512-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP39:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8 -; AVX512-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP39]], null +; AVX512-NEXT: [[TMP43:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8 +; AVX512-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP43]], null ; AVX512-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX512: if.then: ; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] -- 2.7.4
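
For reference, a minimal LLVM IR sketch (not part of the patch) of the canonicalization this revert restores, based on the InstCombineCasts.cpp hunk and the vector-casts.ll test above; the function name @trunc_to_bool_sketch is invented for illustration.

; Before -instcombine: a bare vector trunc to i1.
define <2 x i1> @trunc_to_bool_sketch(<2 x i64> %a) {
  %t = trunc <2 x i64> %a to <2 x i1>
  ret <2 x i1> %t
}

; Expected form after -instcombine with this revert applied: vectors follow the
; same rule as scalars, trunc X to i1 --> icmp ne (and X, 1), 0.
;   %m = and <2 x i64> %a, <i64 1, i64 1>
;   %t = icmp ne <2 x i64> %m, zeroinitializer
;   ret <2 x i1> %t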