And another attempt to start untangling this ball of threads around gather.
There's a `TTI::prefersVectorizedAddressing()` hook, which confusingly defaults to `true`,
which tells LV to try to vectorize the addresses that lead to loads,
but X86 generally cannot deal with vectors of addresses,
the only instructions that support that are GATHER/SCATTER,
but even those aren't available until AVX2, and aren't really usable until AVX512.
This specializes the hook for X86, to return true only if we have AVX512 or AVX2 w/ fast gather.
Reviewed By: RKSimon
Differential Revision: https://reviews.llvm.org/D111546
if (Index == -1U && (Opcode == Instruction::ExtractElement ||
Opcode == Instruction::InsertElement)) {
// TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
- // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
+ // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
// TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
return isLegalMaskedExpandLoad(DataTy);
}
-bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
+// Return true if the subtarget has usable gather instructions: AVX512, or
+// AVX2 on CPUs flagged as having fast gathers.
+bool X86TTIImpl::supportsGather() const {
// Some CPUs have better gather performance than others.
// TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
// enable gather with a -march.
- if (!(ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2())))
+ return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
+}
+
+bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
+ if (!supportsGather())
return false;
// This function is called now in two cases: from the Loop Vectorizer
return Options;
}
+// Tell LV to vectorize the address computations feeding loads only when the
+// target can actually consume vector addresses, i.e. when gathers are usable.
+bool X86TTIImpl::prefersVectorizedAddressing() const {
+ return supportsGather();
+}
+
+// Per-element loads/stores of vectors are scalarized on X86; report false so
+// callers do not assume efficient vector element access.
+bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
+ return false;
+}
+
bool X86TTIImpl::enableInterleavedAccessVectorization() {
// TODO: We expect this to be beneficial regardless of arch,
// but there are currently some unexplained performance artifacts on Atom.
SmallPtrSetImpl<Argument *> &Args) const;
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
bool IsZeroCmp) const;
+ bool prefersVectorizedAddressing() const;
+ bool supportsEfficientVectorElementLoadStore() const;
bool enableInterleavedAccessVectorization();
private:
+ bool supportsGather() const;
InstructionCost getGSScalarCost(unsigned Opcode, Type *DataTy,
bool VariableMask, Align Alignment,
unsigned AddressSpace);
; CHECK: LV: Checking a loop in "test"
;
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i16, i16* %inB, align 2
-; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2
-; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2
-; SSE2: LV: Found an estimated cost of 112 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2
-; SSE2: LV: Found an estimated cost of 224 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2
+; SSE2: LV: Found an estimated cost of 24 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2
+; SSE2: LV: Found an estimated cost of 48 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2
+; SSE2: LV: Found an estimated cost of 96 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2
+; SSE2: LV: Found an estimated cost of 192 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2
;
; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i16, i16* %inB, align 2
-; SSE42: LV: Found an estimated cost of 28 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2
-; SSE42: LV: Found an estimated cost of 56 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2
-; SSE42: LV: Found an estimated cost of 112 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2
-; SSE42: LV: Found an estimated cost of 224 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2
+; SSE42: LV: Found an estimated cost of 24 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2
+; SSE42: LV: Found an estimated cost of 48 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2
+; SSE42: LV: Found an estimated cost of 96 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2
+; SSE42: LV: Found an estimated cost of 192 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX1: LV: Found an estimated cost of 108 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX1: LV: Found an estimated cost of 218 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX1: LV: Found an estimated cost of 436 for VF 32 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX1: LV: Found an estimated cost of 24 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX1: LV: Found an estimated cost of 48 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX1: LV: Found an estimated cost of 96 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX1: LV: Found an estimated cost of 194 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX1: LV: Found an estimated cost of 388 for VF 32 For instruction: %valB = load i16, i16* %inB, align 2
;
; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 218 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 436 for VF 32 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 24 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 48 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 96 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 194 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 388 for VF 32 For instruction: %valB = load i16, i16* %inB, align 2
;
; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i16, i16* %inB, align 2
; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2
; CHECK: LV: Checking a loop in "test"
;
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i32, i32* %inB, align 4
-; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4
-; SSE2: LV: Found an estimated cost of 59 for VF 4 For instruction: %valB = load i32, i32* %inB, align 4
-; SSE2: LV: Found an estimated cost of 118 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4
-; SSE2: LV: Found an estimated cost of 236 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4
+; SSE2: LV: Found an estimated cost of 25 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4
+; SSE2: LV: Found an estimated cost of 51 for VF 4 For instruction: %valB = load i32, i32* %inB, align 4
+; SSE2: LV: Found an estimated cost of 102 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4
+; SSE2: LV: Found an estimated cost of 204 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4
;
; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i32, i32* %inB, align 4
-; SSE42: LV: Found an estimated cost of 29 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4
-; SSE42: LV: Found an estimated cost of 59 for VF 4 For instruction: %valB = load i32, i32* %inB, align 4
-; SSE42: LV: Found an estimated cost of 118 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4
-; SSE42: LV: Found an estimated cost of 236 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4
+; SSE42: LV: Found an estimated cost of 25 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4
+; SSE42: LV: Found an estimated cost of 51 for VF 4 For instruction: %valB = load i32, i32* %inB, align 4
+; SSE42: LV: Found an estimated cost of 102 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4
+; SSE42: LV: Found an estimated cost of 204 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i32, i32* %inB, align 4
-; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4
-; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i32, i32* %inB, align 4
-; AVX1: LV: Found an estimated cost of 110 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4
-; AVX1: LV: Found an estimated cost of 220 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4
-; AVX1: LV: Found an estimated cost of 440 for VF 32 For instruction: %valB = load i32, i32* %inB, align 4
+; AVX1: LV: Found an estimated cost of 24 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4
+; AVX1: LV: Found an estimated cost of 48 for VF 4 For instruction: %valB = load i32, i32* %inB, align 4
+; AVX1: LV: Found an estimated cost of 98 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4
+; AVX1: LV: Found an estimated cost of 196 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4
+; AVX1: LV: Found an estimated cost of 392 for VF 32 For instruction: %valB = load i32, i32* %inB, align 4
;
; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 110 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 220 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 440 for VF 32 For instruction: %valB = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 24 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 48 for VF 4 For instruction: %valB = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 98 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 196 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 392 for VF 32 For instruction: %valB = load i32, i32* %inB, align 4
;
; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i32, i32* %inB, align 4
; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4
; CHECK: LV: Checking a loop in "test"
;
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i64, i64* %inB, align 8
-; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8
-; SSE2: LV: Found an estimated cost of 58 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8
-; SSE2: LV: Found an estimated cost of 116 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8
-; SSE2: LV: Found an estimated cost of 232 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8
+; SSE2: LV: Found an estimated cost of 25 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8
+; SSE2: LV: Found an estimated cost of 50 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8
+; SSE2: LV: Found an estimated cost of 100 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8
+; SSE2: LV: Found an estimated cost of 200 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8
;
; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i64, i64* %inB, align 8
-; SSE42: LV: Found an estimated cost of 29 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8
-; SSE42: LV: Found an estimated cost of 58 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8
-; SSE42: LV: Found an estimated cost of 116 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8
-; SSE42: LV: Found an estimated cost of 232 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8
+; SSE42: LV: Found an estimated cost of 25 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8
+; SSE42: LV: Found an estimated cost of 51 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8
+; SSE42: LV: Found an estimated cost of 102 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8
+; SSE42: LV: Found an estimated cost of 204 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i64, i64* %inB, align 8
-; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8
-; AVX1: LV: Found an estimated cost of 56 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8
-; AVX1: LV: Found an estimated cost of 112 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8
-; AVX1: LV: Found an estimated cost of 224 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8
-; AVX1: LV: Found an estimated cost of 448 for VF 32 For instruction: %valB = load i64, i64* %inB, align 8
+; AVX1: LV: Found an estimated cost of 24 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8
+; AVX1: LV: Found an estimated cost of 50 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8
+; AVX1: LV: Found an estimated cost of 100 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8
+; AVX1: LV: Found an estimated cost of 200 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8
+; AVX1: LV: Found an estimated cost of 400 for VF 32 For instruction: %valB = load i64, i64* %inB, align 8
;
; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i64, i64* %inB, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 56 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 112 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction: %valB = load i64, i64* %inB, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 24 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 50 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 100 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 200 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 400 for VF 32 For instruction: %valB = load i64, i64* %inB, align 8
;
; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i64, i64* %inB, align 8
; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8
; CHECK: LV: Checking a loop in "test"
;
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i8, i8* %inB, align 1
-; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1
-; SSE2: LV: Found an estimated cost of 59 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1
-; SSE2: LV: Found an estimated cost of 119 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1
-; SSE2: LV: Found an estimated cost of 239 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1
+; SSE2: LV: Found an estimated cost of 25 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1
+; SSE2: LV: Found an estimated cost of 51 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1
+; SSE2: LV: Found an estimated cost of 103 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1
+; SSE2: LV: Found an estimated cost of 207 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1
;
; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i8, i8* %inB, align 1
-; SSE42: LV: Found an estimated cost of 29 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1
-; SSE42: LV: Found an estimated cost of 59 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1
-; SSE42: LV: Found an estimated cost of 119 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1
-; SSE42: LV: Found an estimated cost of 239 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1
+; SSE42: LV: Found an estimated cost of 25 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1
+; SSE42: LV: Found an estimated cost of 51 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1
+; SSE42: LV: Found an estimated cost of 103 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1
+; SSE42: LV: Found an estimated cost of 207 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX1: LV: Found an estimated cost of 108 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX1: LV: Found an estimated cost of 216 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX1: LV: Found an estimated cost of 434 for VF 32 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX1: LV: Found an estimated cost of 24 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX1: LV: Found an estimated cost of 48 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX1: LV: Found an estimated cost of 96 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX1: LV: Found an estimated cost of 192 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX1: LV: Found an estimated cost of 386 for VF 32 For instruction: %valB = load i8, i8* %inB, align 1
;
; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 216 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 434 for VF 32 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 24 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 48 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 96 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 192 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 386 for VF 32 For instruction: %valB = load i8, i8* %inB, align 1
;
; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i8, i8* %inB, align 1
; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1
; SSE2: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load float, float* %in0, align 4
; SSE2: LV: Found an estimated cost of 21 for VF 4 For instruction: %v0 = load float, float* %in0, align 4
; SSE2: LV: Found an estimated cost of 42 for VF 8 For instruction: %v0 = load float, float* %in0, align 4
+; SSE2: LV: Found an estimated cost of 84 for VF 16 For instruction: %v0 = load float, float* %in0, align 4
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, float* %in0, align 4
; AVX1: LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load float, float* %in0, align 4
; AVX1: LV: Found an estimated cost of 24 for VF 4 For instruction: %v0 = load float, float* %in0, align 4
; AVX1: LV: Found an estimated cost of 57 for VF 8 For instruction: %v0 = load float, float* %in0, align 4
; AVX1: LV: Found an estimated cost of 114 for VF 16 For instruction: %v0 = load float, float* %in0, align 4
+; AVX1: LV: Found an estimated cost of 228 for VF 32 For instruction: %v0 = load float, float* %in0, align 4
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, float* %in0, align 4
; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load float, float* %in0, align 4
; AVX2: LV: Found an estimated cost of 5 for VF 4 For instruction: %v0 = load float, float* %in0, align 4
; AVX2: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load float, float* %in0, align 4
; AVX2: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load float, float* %in0, align 4
-;
+; AVX2: LV: Found an estimated cost of 228 for VF 32 For instruction: %v0 = load float, float* %in0, align 4
+;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, float* %in0, align 4
; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load float, float* %in0, align 4
; AVX512: LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load float, float* %in0, align 4
; SSE2: LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load float, float* %in0, align 4
; SSE2: LV: Found an estimated cost of 28 for VF 4 For instruction: %v0 = load float, float* %in0, align 4
; SSE2: LV: Found an estimated cost of 56 for VF 8 For instruction: %v0 = load float, float* %in0, align 4
+; SSE2: LV: Found an estimated cost of 112 for VF 16 For instruction: %v0 = load float, float* %in0, align 4
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, float* %in0, align 4
; AVX1: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load float, float* %in0, align 4
; AVX1: LV: Found an estimated cost of 34 for VF 4 For instruction: %v0 = load float, float* %in0, align 4
; AVX1: LV: Found an estimated cost of 76 for VF 8 For instruction: %v0 = load float, float* %in0, align 4
; AVX1: LV: Found an estimated cost of 152 for VF 16 For instruction: %v0 = load float, float* %in0, align 4
+; AVX1: LV: Found an estimated cost of 304 for VF 32 For instruction: %v0 = load float, float* %in0, align 4
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, float* %in0, align 4
; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load float, float* %in0, align 4
; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load float, float* %in0, align 4
; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load float, float* %in0, align 4
; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load float, float* %in0, align 4
+; AVX2: LV: Found an estimated cost of 304 for VF 32 For instruction: %v0 = load float, float* %in0, align 4
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, float* %in0, align 4
; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load float, float* %in0, align 4
; SSE2: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load double, double* %in0, align 8
; SSE2: LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load double, double* %in0, align 8
; SSE2: LV: Found an estimated cost of 24 for VF 8 For instruction: %v0 = load double, double* %in0, align 8
+; SSE2: LV: Found an estimated cost of 48 for VF 16 For instruction: %v0 = load double, double* %in0, align 8
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, double* %in0, align 8
; AVX1: LV: Found an estimated cost of 7 for VF 2 For instruction: %v0 = load double, double* %in0, align 8
; AVX1: LV: Found an estimated cost of 16 for VF 4 For instruction: %v0 = load double, double* %in0, align 8
; AVX1: LV: Found an estimated cost of 32 for VF 8 For instruction: %v0 = load double, double* %in0, align 8
; AVX1: LV: Found an estimated cost of 64 for VF 16 For instruction: %v0 = load double, double* %in0, align 8
+; AVX1: LV: Found an estimated cost of 128 for VF 32 For instruction: %v0 = load double, double* %in0, align 8
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, double* %in0, align 8
; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load double, double* %in0, align 8
; AVX2: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load double, double* %in0, align 8
; AVX2: LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load double, double* %in0, align 8
; AVX2: LV: Found an estimated cost of 24 for VF 16 For instruction: %v0 = load double, double* %in0, align 8
+; AVX2: LV: Found an estimated cost of 128 for VF 32 For instruction: %v0 = load double, double* %in0, align 8
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, double* %in0, align 8
; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load double, double* %in0, align 8
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, double* %in0, align 8
; SSE2: LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load double, double* %in0, align 8
; SSE2: LV: Found an estimated cost of 24 for VF 4 For instruction: %v0 = load double, double* %in0, align 8
+; SSE2: LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load double, double* %in0, align 8
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, double* %in0, align 8
; AVX1: LV: Found an estimated cost of 14 for VF 2 For instruction: %v0 = load double, double* %in0, align 8
; AVX1: LV: Found an estimated cost of 32 for VF 4 For instruction: %v0 = load double, double* %in0, align 8
; AVX1: LV: Found an estimated cost of 64 for VF 8 For instruction: %v0 = load double, double* %in0, align 8
+; AVX1: LV: Found an estimated cost of 128 for VF 16 For instruction: %v0 = load double, double* %in0, align 8
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, double* %in0, align 8
; AVX2: LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load double, double* %in0, align 8
; AVX2: LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load double, double* %in0, align 8
; AVX2: LV: Found an estimated cost of 28 for VF 8 For instruction: %v0 = load double, double* %in0, align 8
+; AVX2: LV: Found an estimated cost of 128 for VF 16 For instruction: %v0 = load double, double* %in0, align 8
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, double* %in0, align 8
; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load double, double* %in0, align 8
; SSE2: LV: Found an estimated cost of 22 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
; SSE2: LV: Found an estimated cost of 43 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2
; SSE2: LV: Found an estimated cost of 85 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2
+; SSE2: LV: Found an estimated cost of 170 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 50 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 99 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 215 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2
+; AVX1: LV: Found an estimated cost of 430 for VF 32 For instruction: %v0 = load i16, i16* %in0, align 2
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX2: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX2: LV: Found an estimated cost of 50 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX2: LV: Found an estimated cost of 99 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX2: LV: Found an estimated cost of 215 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2
+; AVX2: LV: Found an estimated cost of 430 for VF 32 For instruction: %v0 = load i16, i16* %in0, align 2
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX512: LV: Found an estimated cost of 11 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
; SSE2: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
; SSE2: LV: Found an estimated cost of 51 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2
; SSE2: LV: Found an estimated cost of 102 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2
+; SSE2: LV: Found an estimated cost of 204 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 31 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 58 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 123 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 258 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2
+; AVX1: LV: Found an estimated cost of 516 for VF 32 For instruction: %v0 = load i16, i16* %in0, align 2
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX2: LV: Found an estimated cost of 16 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX2: LV: Found an estimated cost of 11 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX2: LV: Found an estimated cost of 42 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX2: LV: Found an estimated cost of 112 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX2: LV: Found an estimated cost of 516 for VF 32 For instruction: %v0 = load i16, i16* %in0, align 2
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX512: LV: Found an estimated cost of 13 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
; SSE2: LV: Found an estimated cost of 7 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 15 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 30 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 60 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 11 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 24 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 48 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 96 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 6 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 12 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 24 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 1 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 14 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 31 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 62 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 124 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 21 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 47 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 94 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 188 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 5 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 188 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 17 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 34 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 68 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 7 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 11 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 25 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 50 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 100 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 5 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 100 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 1 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 45 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 90 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 180 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 17 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 30 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 69 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 138 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 276 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 5 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 276 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 45 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 90 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 180 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 16 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 32 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 70 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 140 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 14 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 30 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 60 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 120 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 11 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 22 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 96 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 192 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 192 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 7 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 15 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 30 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 60 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 26 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 52 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 104 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 104 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 1 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 60 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 120 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 240 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 42 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 92 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 184 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 368 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 368 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 14 for VF 2 For instruction: %v0 = load i64, i64* %in0, align 8
; SSE2: LV: Found an estimated cost of 28 for VF 4 For instruction: %v0 = load i64, i64* %in0, align 8
; SSE2: LV: Found an estimated cost of 56 for VF 8 For instruction: %v0 = load i64, i64* %in0, align 8
; SSE2: LV: Found an estimated cost of 112 for VF 16 For instruction: %v0 = load i64, i64* %in0, align 8
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, i64* %in0, align 8
; AVX1: LV: Found an estimated cost of 11 for VF 2 For instruction: %v0 = load i64, i64* %in0, align 8
; AVX1: LV: Found an estimated cost of 26 for VF 4 For instruction: %v0 = load i64, i64* %in0, align 8
; AVX1: LV: Found an estimated cost of 52 for VF 8 For instruction: %v0 = load i64, i64* %in0, align 8
; AVX1: LV: Found an estimated cost of 104 for VF 16 For instruction: %v0 = load i64, i64* %in0, align 8
; AVX1: LV: Found an estimated cost of 208 for VF 32 For instruction: %v0 = load i64, i64* %in0, align 8
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, i64* %in0, align 8
; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i64, i64* %in0, align 8
; AVX2: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load i64, i64* %in0, align 8
; AVX2: LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load i64, i64* %in0, align 8
; AVX2: LV: Found an estimated cost of 24 for VF 16 For instruction: %v0 = load i64, i64* %in0, align 8
; AVX2: LV: Found an estimated cost of 208 for VF 32 For instruction: %v0 = load i64, i64* %in0, align 8
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, i64* %in0, align 8
; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i64, i64* %in0, align 8
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, i64* %in0, align 8
; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i64, i64* %in0, align 8
; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i64, i64* %in0, align 8
; SSE2: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load i64, i64* %in0, align 8
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, i64* %in0, align 8
; AVX1: LV: Found an estimated cost of 22 for VF 2 For instruction: %v0 = load i64, i64* %in0, align 8
; AVX1: LV: Found an estimated cost of 52 for VF 4 For instruction: %v0 = load i64, i64* %in0, align 8
; AVX1: LV: Found an estimated cost of 104 for VF 8 For instruction: %v0 = load i64, i64* %in0, align 8
; AVX1: LV: Found an estimated cost of 208 for VF 16 For instruction: %v0 = load i64, i64* %in0, align 8
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, i64* %in0, align 8
; AVX2: LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load i64, i64* %in0, align 8
; AVX2: LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load i64, i64* %in0, align 8
; AVX2: LV: Found an estimated cost of 28 for VF 8 For instruction: %v0 = load i64, i64* %in0, align 8
; AVX2: LV: Found an estimated cost of 208 for VF 16 For instruction: %v0 = load i64, i64* %in0, align 8
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, i64* %in0, align 8
; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i64, i64* %in0, align 8
; SSE2: LV: Found an estimated cost of 11 for VF 2 For instruction: store float %v2, float* %out2, align 4
; SSE2: LV: Found an estimated cost of 24 for VF 4 For instruction: store float %v2, float* %out2, align 4
; SSE2: LV: Found an estimated cost of 48 for VF 8 For instruction: store float %v2, float* %out2, align 4
; SSE2: LV: Found an estimated cost of 96 for VF 16 For instruction: store float %v2, float* %out2, align 4
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v2, float* %out2, align 4
; AVX1: LV: Found an estimated cost of 13 for VF 2 For instruction: store float %v2, float* %out2, align 4
; AVX1: LV: Found an estimated cost of 23 for VF 4 For instruction: store float %v2, float* %out2, align 4
; AVX1: LV: Found an estimated cost of 57 for VF 8 For instruction: store float %v2, float* %out2, align 4
; AVX1: LV: Found an estimated cost of 114 for VF 16 For instruction: store float %v2, float* %out2, align 4
; AVX1: LV: Found an estimated cost of 228 for VF 32 For instruction: store float %v2, float* %out2, align 4
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v2, float* %out2, align 4
; AVX2: LV: Found an estimated cost of 7 for VF 2 For instruction: store float %v2, float* %out2, align 4
; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: store float %v2, float* %out2, align 4
; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: store float %v2, float* %out2, align 4
; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction: store float %v2, float* %out2, align 4
; AVX2: LV: Found an estimated cost of 228 for VF 32 For instruction: store float %v2, float* %out2, align 4
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v2, float* %out2, align 4
; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: store float %v2, float* %out2, align 4
; SSE2: LV: Found an estimated cost of 12 for VF 2 For instruction: store float %v3, float* %out3, align 4
; SSE2: LV: Found an estimated cost of 28 for VF 4 For instruction: store float %v3, float* %out3, align 4
; SSE2: LV: Found an estimated cost of 56 for VF 8 For instruction: store float %v3, float* %out3, align 4
; SSE2: LV: Found an estimated cost of 112 for VF 16 For instruction: store float %v3, float* %out3, align 4
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v3, float* %out3, align 4
; AVX1: LV: Found an estimated cost of 13 for VF 2 For instruction: store float %v3, float* %out3, align 4
; AVX1: LV: Found an estimated cost of 30 for VF 4 For instruction: store float %v3, float* %out3, align 4
; AVX1: LV: Found an estimated cost of 76 for VF 8 For instruction: store float %v3, float* %out3, align 4
; AVX1: LV: Found an estimated cost of 152 for VF 16 For instruction: store float %v3, float* %out3, align 4
; AVX1: LV: Found an estimated cost of 304 for VF 32 For instruction: store float %v3, float* %out3, align 4
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v3, float* %out3, align 4
; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction: store float %v3, float* %out3, align 4
; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: store float %v3, float* %out3, align 4
; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: store float %v3, float* %out3, align 4
; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: store float %v3, float* %out3, align 4
; AVX2: LV: Found an estimated cost of 304 for VF 32 For instruction: store float %v3, float* %out3, align 4
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v3, float* %out3, align 4
; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction: store float %v3, float* %out3, align 4
; SSE2: LV: Found an estimated cost of 6 for VF 2 For instruction: store double %v1, double* %out1, align 8
; SSE2: LV: Found an estimated cost of 12 for VF 4 For instruction: store double %v1, double* %out1, align 8
; SSE2: LV: Found an estimated cost of 24 for VF 8 For instruction: store double %v1, double* %out1, align 8
; SSE2: LV: Found an estimated cost of 48 for VF 16 For instruction: store double %v1, double* %out1, align 8
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, double* %out1, align 8
; AVX1: LV: Found an estimated cost of 6 for VF 2 For instruction: store double %v1, double* %out1, align 8
; AVX1: LV: Found an estimated cost of 16 for VF 4 For instruction: store double %v1, double* %out1, align 8
; AVX1: LV: Found an estimated cost of 32 for VF 8 For instruction: store double %v1, double* %out1, align 8
; AVX1: LV: Found an estimated cost of 64 for VF 16 For instruction: store double %v1, double* %out1, align 8
; AVX1: LV: Found an estimated cost of 128 for VF 32 For instruction: store double %v1, double* %out1, align 8
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, double* %out1, align 8
; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: store double %v1, double* %out1, align 8
; AVX2: LV: Found an estimated cost of 6 for VF 4 For instruction: store double %v1, double* %out1, align 8
; AVX2: LV: Found an estimated cost of 12 for VF 8 For instruction: store double %v1, double* %out1, align 8
; AVX2: LV: Found an estimated cost of 24 for VF 16 For instruction: store double %v1, double* %out1, align 8
; AVX2: LV: Found an estimated cost of 128 for VF 32 For instruction: store double %v1, double* %out1, align 8
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, double* %out1, align 8
; AVX512: LV: Found an estimated cost of 2 for VF 2 For instruction: store double %v1, double* %out1, align 8
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, double* %out3, align 8
; SSE2: LV: Found an estimated cost of 12 for VF 2 For instruction: store double %v3, double* %out3, align 8
; SSE2: LV: Found an estimated cost of 24 for VF 4 For instruction: store double %v3, double* %out3, align 8
; SSE2: LV: Found an estimated cost of 48 for VF 8 For instruction: store double %v3, double* %out3, align 8
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, double* %out3, align 8
; AVX1: LV: Found an estimated cost of 12 for VF 2 For instruction: store double %v3, double* %out3, align 8
; AVX1: LV: Found an estimated cost of 32 for VF 4 For instruction: store double %v3, double* %out3, align 8
; AVX1: LV: Found an estimated cost of 64 for VF 8 For instruction: store double %v3, double* %out3, align 8
; AVX1: LV: Found an estimated cost of 128 for VF 16 For instruction: store double %v3, double* %out3, align 8
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, double* %out3, align 8
; AVX2: LV: Found an estimated cost of 8 for VF 2 For instruction: store double %v3, double* %out3, align 8
; AVX2: LV: Found an estimated cost of 12 for VF 4 For instruction: store double %v3, double* %out3, align 8
; AVX2: LV: Found an estimated cost of 28 for VF 8 For instruction: store double %v3, double* %out3, align 8
; AVX2: LV: Found an estimated cost of 128 for VF 16 For instruction: store double %v3, double* %out3, align 8
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, double* %out3, align 8
; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction: store double %v3, double* %out3, align 8
; SSE2: LV: Found an estimated cost of 22 for VF 2 For instruction: store i16 %v4, i16* %out4, align 2
; SSE2: LV: Found an estimated cost of 43 for VF 4 For instruction: store i16 %v4, i16* %out4, align 2
; SSE2: LV: Found an estimated cost of 85 for VF 8 For instruction: store i16 %v4, i16* %out4, align 2
; SSE2: LV: Found an estimated cost of 170 for VF 16 For instruction: store i16 %v4, i16* %out4, align 2
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v4, i16* %out4, align 2
; AVX1: LV: Found an estimated cost of 27 for VF 2 For instruction: store i16 %v4, i16* %out4, align 2
; AVX1: LV: Found an estimated cost of 45 for VF 4 For instruction: store i16 %v4, i16* %out4, align 2
; AVX1: LV: Found an estimated cost of 88 for VF 8 For instruction: store i16 %v4, i16* %out4, align 2
; AVX1: LV: Found an estimated cost of 215 for VF 16 For instruction: store i16 %v4, i16* %out4, align 2
; AVX1: LV: Found an estimated cost of 430 for VF 32 For instruction: store i16 %v4, i16* %out4, align 2
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v4, i16* %out4, align 2
; AVX2: LV: Found an estimated cost of 27 for VF 2 For instruction: store i16 %v4, i16* %out4, align 2
; AVX2: LV: Found an estimated cost of 45 for VF 4 For instruction: store i16 %v4, i16* %out4, align 2
; AVX2: LV: Found an estimated cost of 88 for VF 8 For instruction: store i16 %v4, i16* %out4, align 2
; AVX2: LV: Found an estimated cost of 215 for VF 16 For instruction: store i16 %v4, i16* %out4, align 2
; AVX2: LV: Found an estimated cost of 430 for VF 32 For instruction: store i16 %v4, i16* %out4, align 2
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v4, i16* %out4, align 2
; AVX512: LV: Found an estimated cost of 11 for VF 2 For instruction: store i16 %v4, i16* %out4, align 2
; SSE2: LV: Found an estimated cost of 26 for VF 2 For instruction: store i16 %v5, i16* %out5, align 2
; SSE2: LV: Found an estimated cost of 51 for VF 4 For instruction: store i16 %v5, i16* %out5, align 2
; SSE2: LV: Found an estimated cost of 102 for VF 8 For instruction: store i16 %v5, i16* %out5, align 2
; SSE2: LV: Found an estimated cost of 204 for VF 16 For instruction: store i16 %v5, i16* %out5, align 2
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v5, i16* %out5, align 2
; AVX1: LV: Found an estimated cost of 30 for VF 2 For instruction: store i16 %v5, i16* %out5, align 2
; AVX1: LV: Found an estimated cost of 53 for VF 4 For instruction: store i16 %v5, i16* %out5, align 2
; AVX1: LV: Found an estimated cost of 105 for VF 8 For instruction: store i16 %v5, i16* %out5, align 2
; AVX1: LV: Found an estimated cost of 258 for VF 16 For instruction: store i16 %v5, i16* %out5, align 2
; AVX1: LV: Found an estimated cost of 516 for VF 32 For instruction: store i16 %v5, i16* %out5, align 2
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v5, i16* %out5, align 2
; AVX2: LV: Found an estimated cost of 13 for VF 2 For instruction: store i16 %v5, i16* %out5, align 2
; AVX2: LV: Found an estimated cost of 17 for VF 4 For instruction: store i16 %v5, i16* %out5, align 2
; AVX2: LV: Found an estimated cost of 24 for VF 8 For instruction: store i16 %v5, i16* %out5, align 2
; AVX2: LV: Found an estimated cost of 64 for VF 16 For instruction: store i16 %v5, i16* %out5, align 2
; AVX2: LV: Found an estimated cost of 516 for VF 32 For instruction: store i16 %v5, i16* %out5, align 2
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v5, i16* %out5, align 2
; AVX512: LV: Found an estimated cost of 13 for VF 2 For instruction: store i16 %v5, i16* %out5, align 2
; SSE2: LV: Found an estimated cost of 23 for VF 2 For instruction: store i32 %v2, i32* %out2, align 4
; SSE2: LV: Found an estimated cost of 48 for VF 4 For instruction: store i32 %v2, i32* %out2, align 4
; SSE2: LV: Found an estimated cost of 96 for VF 8 For instruction: store i32 %v2, i32* %out2, align 4
; SSE2: LV: Found an estimated cost of 192 for VF 16 For instruction: store i32 %v2, i32* %out2, align 4
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v2, i32* %out2, align 4
; AVX1: LV: Found an estimated cost of 18 for VF 2 For instruction: store i32 %v2, i32* %out2, align 4
; AVX1: LV: Found an estimated cost of 29 for VF 4 For instruction: store i32 %v2, i32* %out2, align 4
; AVX1: LV: Found an estimated cost of 69 for VF 8 For instruction: store i32 %v2, i32* %out2, align 4
; AVX1: LV: Found an estimated cost of 138 for VF 16 For instruction: store i32 %v2, i32* %out2, align 4
; AVX1: LV: Found an estimated cost of 276 for VF 32 For instruction: store i32 %v2, i32* %out2, align 4
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v2, i32* %out2, align 4
; AVX2: LV: Found an estimated cost of 7 for VF 2 For instruction: store i32 %v2, i32* %out2, align 4
; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: store i32 %v2, i32* %out2, align 4
; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: store i32 %v2, i32* %out2, align 4
; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction: store i32 %v2, i32* %out2, align 4
; AVX2: LV: Found an estimated cost of 276 for VF 32 For instruction: store i32 %v2, i32* %out2, align 4
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v2, i32* %out2, align 4
; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: store i32 %v2, i32* %out2, align 4
; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction: store i32 %v3, i32* %out3, align 4
; SSE2: LV: Found an estimated cost of 60 for VF 4 For instruction: store i32 %v3, i32* %out3, align 4
; SSE2: LV: Found an estimated cost of 120 for VF 8 For instruction: store i32 %v3, i32* %out3, align 4
+; SSE2: LV: Found an estimated cost of 240 for VF 16 For instruction: store i32 %v3, i32* %out3, align 4
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v3, i32* %out3, align 4
; AVX1: LV: Found an estimated cost of 19 for VF 2 For instruction: store i32 %v3, i32* %out3, align 4
; AVX1: LV: Found an estimated cost of 38 for VF 4 For instruction: store i32 %v3, i32* %out3, align 4
; AVX1: LV: Found an estimated cost of 92 for VF 8 For instruction: store i32 %v3, i32* %out3, align 4
; AVX1: LV: Found an estimated cost of 184 for VF 16 For instruction: store i32 %v3, i32* %out3, align 4
+; AVX1: LV: Found an estimated cost of 368 for VF 32 For instruction: store i32 %v3, i32* %out3, align 4
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v3, i32* %out3, align 4
; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction: store i32 %v3, i32* %out3, align 4
; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: store i32 %v3, i32* %out3, align 4
; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: store i32 %v3, i32* %out3, align 4
; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: store i32 %v3, i32* %out3, align 4
+; AVX2: LV: Found an estimated cost of 368 for VF 32 For instruction: store i32 %v3, i32* %out3, align 4
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v3, i32* %out3, align 4
; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction: store i32 %v3, i32* %out3, align 4
; SSE2: LV: Found an estimated cost of 14 for VF 2 For instruction: store i64 %v1, i64* %out1, align 8
; SSE2: LV: Found an estimated cost of 28 for VF 4 For instruction: store i64 %v1, i64* %out1, align 8
; SSE2: LV: Found an estimated cost of 56 for VF 8 For instruction: store i64 %v1, i64* %out1, align 8
+; SSE2: LV: Found an estimated cost of 112 for VF 16 For instruction: store i64 %v1, i64* %out1, align 8
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, i64* %out1, align 8
; AVX1: LV: Found an estimated cost of 11 for VF 2 For instruction: store i64 %v1, i64* %out1, align 8
; AVX1: LV: Found an estimated cost of 26 for VF 4 For instruction: store i64 %v1, i64* %out1, align 8
; AVX1: LV: Found an estimated cost of 52 for VF 8 For instruction: store i64 %v1, i64* %out1, align 8
; AVX1: LV: Found an estimated cost of 104 for VF 16 For instruction: store i64 %v1, i64* %out1, align 8
+; AVX1: LV: Found an estimated cost of 208 for VF 32 For instruction: store i64 %v1, i64* %out1, align 8
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, i64* %out1, align 8
; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: store i64 %v1, i64* %out1, align 8
; AVX2: LV: Found an estimated cost of 6 for VF 4 For instruction: store i64 %v1, i64* %out1, align 8
; AVX2: LV: Found an estimated cost of 12 for VF 8 For instruction: store i64 %v1, i64* %out1, align 8
; AVX2: LV: Found an estimated cost of 24 for VF 16 For instruction: store i64 %v1, i64* %out1, align 8
+; AVX2: LV: Found an estimated cost of 208 for VF 32 For instruction: store i64 %v1, i64* %out1, align 8
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, i64* %out1, align 8
; AVX512: LV: Found an estimated cost of 2 for VF 2 For instruction: store i64 %v1, i64* %out1, align 8
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, i64* %out3, align 8
; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction: store i64 %v3, i64* %out3, align 8
; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction: store i64 %v3, i64* %out3, align 8
+; SSE2: LV: Found an estimated cost of 112 for VF 8 For instruction: store i64 %v3, i64* %out3, align 8
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, i64* %out3, align 8
; AVX1: LV: Found an estimated cost of 22 for VF 2 For instruction: store i64 %v3, i64* %out3, align 8
; AVX1: LV: Found an estimated cost of 52 for VF 4 For instruction: store i64 %v3, i64* %out3, align 8
; AVX1: LV: Found an estimated cost of 104 for VF 8 For instruction: store i64 %v3, i64* %out3, align 8
+; AVX1: LV: Found an estimated cost of 208 for VF 16 For instruction: store i64 %v3, i64* %out3, align 8
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, i64* %out3, align 8
; AVX2: LV: Found an estimated cost of 8 for VF 2 For instruction: store i64 %v3, i64* %out3, align 8
; AVX2: LV: Found an estimated cost of 12 for VF 4 For instruction: store i64 %v3, i64* %out3, align 8
; AVX2: LV: Found an estimated cost of 28 for VF 8 For instruction: store i64 %v3, i64* %out3, align 8
+; AVX2: LV: Found an estimated cost of 208 for VF 16 For instruction: store i64 %v3, i64* %out3, align 8
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, i64* %out3, align 8
; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction: store i64 %v3, i64* %out3, align 8
}
; This function uses a stride that is generally too big to benefit from vectorization without
-; really good support for a gather load. We were not computing an accurate cost for the
-; vectorization and subsequent scalarization of the pointer induction variables.
+; really good support for a gather load. But if we don't vectorize the pointer induction,
+; then we don't need to extract the pointers out of a vector of pointers,
+; and the vectorization becomes profitable.
define float @PR27826(float* nocapture readonly %a, float* nocapture readonly %b, i32 %n) {
; CHECK-LABEL: @PR27826(
; CHECK-NEXT: br i1 [[CMP]], label [[PREHEADER:%.*]], label [[FOR_END:%.*]]
; CHECK: preheader:
; CHECK-NEXT: [[T0:%.*]] = sext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[T0]], -1
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 5
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 16
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 32
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP119:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP120:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP121:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP122:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 32
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 32
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 64
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 96
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 128
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 160
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 192
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 224
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 256
+; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 288
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 320
+; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 352
+; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 384
+; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 416
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 448
+; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 480
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP10]]
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP14]]
+; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP15]]
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP16]]
+; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP17]]
+; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP18]]
+; CHECK-NEXT: [[TMP35:%.*]] = load float, float* [[TMP19]], align 4
+; CHECK-NEXT: [[TMP36:%.*]] = load float, float* [[TMP20]], align 4
+; CHECK-NEXT: [[TMP37:%.*]] = load float, float* [[TMP21]], align 4
+; CHECK-NEXT: [[TMP38:%.*]] = load float, float* [[TMP22]], align 4
+; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x float> poison, float [[TMP35]], i32 0
+; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x float> [[TMP39]], float [[TMP36]], i32 1
+; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP37]], i32 2
+; CHECK-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP38]], i32 3
+; CHECK-NEXT: [[TMP43:%.*]] = load float, float* [[TMP23]], align 4
+; CHECK-NEXT: [[TMP44:%.*]] = load float, float* [[TMP24]], align 4
+; CHECK-NEXT: [[TMP45:%.*]] = load float, float* [[TMP25]], align 4
+; CHECK-NEXT: [[TMP46:%.*]] = load float, float* [[TMP26]], align 4
+; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x float> poison, float [[TMP43]], i32 0
+; CHECK-NEXT: [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP44]], i32 1
+; CHECK-NEXT: [[TMP49:%.*]] = insertelement <4 x float> [[TMP48]], float [[TMP45]], i32 2
+; CHECK-NEXT: [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP46]], i32 3
+; CHECK-NEXT: [[TMP51:%.*]] = load float, float* [[TMP27]], align 4
+; CHECK-NEXT: [[TMP52:%.*]] = load float, float* [[TMP28]], align 4
+; CHECK-NEXT: [[TMP53:%.*]] = load float, float* [[TMP29]], align 4
+; CHECK-NEXT: [[TMP54:%.*]] = load float, float* [[TMP30]], align 4
+; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x float> poison, float [[TMP51]], i32 0
+; CHECK-NEXT: [[TMP56:%.*]] = insertelement <4 x float> [[TMP55]], float [[TMP52]], i32 1
+; CHECK-NEXT: [[TMP57:%.*]] = insertelement <4 x float> [[TMP56]], float [[TMP53]], i32 2
+; CHECK-NEXT: [[TMP58:%.*]] = insertelement <4 x float> [[TMP57]], float [[TMP54]], i32 3
+; CHECK-NEXT: [[TMP59:%.*]] = load float, float* [[TMP31]], align 4
+; CHECK-NEXT: [[TMP60:%.*]] = load float, float* [[TMP32]], align 4
+; CHECK-NEXT: [[TMP61:%.*]] = load float, float* [[TMP33]], align 4
+; CHECK-NEXT: [[TMP62:%.*]] = load float, float* [[TMP34]], align 4
+; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x float> poison, float [[TMP59]], i32 0
+; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x float> [[TMP63]], float [[TMP60]], i32 1
+; CHECK-NEXT: [[TMP65:%.*]] = insertelement <4 x float> [[TMP64]], float [[TMP61]], i32 2
+; CHECK-NEXT: [[TMP66:%.*]] = insertelement <4 x float> [[TMP65]], float [[TMP62]], i32 3
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP10]]
+; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP14]]
+; CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP15]]
+; CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP16]]
+; CHECK-NEXT: [[TMP81:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP17]]
+; CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP18]]
+; CHECK-NEXT: [[TMP83:%.*]] = load float, float* [[TMP67]], align 4
+; CHECK-NEXT: [[TMP84:%.*]] = load float, float* [[TMP68]], align 4
+; CHECK-NEXT: [[TMP85:%.*]] = load float, float* [[TMP69]], align 4
+; CHECK-NEXT: [[TMP86:%.*]] = load float, float* [[TMP70]], align 4
+; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x float> poison, float [[TMP83]], i32 0
+; CHECK-NEXT: [[TMP88:%.*]] = insertelement <4 x float> [[TMP87]], float [[TMP84]], i32 1
+; CHECK-NEXT: [[TMP89:%.*]] = insertelement <4 x float> [[TMP88]], float [[TMP85]], i32 2
+; CHECK-NEXT: [[TMP90:%.*]] = insertelement <4 x float> [[TMP89]], float [[TMP86]], i32 3
+; CHECK-NEXT: [[TMP91:%.*]] = load float, float* [[TMP71]], align 4
+; CHECK-NEXT: [[TMP92:%.*]] = load float, float* [[TMP72]], align 4
+; CHECK-NEXT: [[TMP93:%.*]] = load float, float* [[TMP73]], align 4
+; CHECK-NEXT: [[TMP94:%.*]] = load float, float* [[TMP74]], align 4
+; CHECK-NEXT: [[TMP95:%.*]] = insertelement <4 x float> poison, float [[TMP91]], i32 0
+; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x float> [[TMP95]], float [[TMP92]], i32 1
+; CHECK-NEXT: [[TMP97:%.*]] = insertelement <4 x float> [[TMP96]], float [[TMP93]], i32 2
+; CHECK-NEXT: [[TMP98:%.*]] = insertelement <4 x float> [[TMP97]], float [[TMP94]], i32 3
+; CHECK-NEXT: [[TMP99:%.*]] = load float, float* [[TMP75]], align 4
+; CHECK-NEXT: [[TMP100:%.*]] = load float, float* [[TMP76]], align 4
+; CHECK-NEXT: [[TMP101:%.*]] = load float, float* [[TMP77]], align 4
+; CHECK-NEXT: [[TMP102:%.*]] = load float, float* [[TMP78]], align 4
+; CHECK-NEXT: [[TMP103:%.*]] = insertelement <4 x float> poison, float [[TMP99]], i32 0
+; CHECK-NEXT: [[TMP104:%.*]] = insertelement <4 x float> [[TMP103]], float [[TMP100]], i32 1
+; CHECK-NEXT: [[TMP105:%.*]] = insertelement <4 x float> [[TMP104]], float [[TMP101]], i32 2
+; CHECK-NEXT: [[TMP106:%.*]] = insertelement <4 x float> [[TMP105]], float [[TMP102]], i32 3
+; CHECK-NEXT: [[TMP107:%.*]] = load float, float* [[TMP79]], align 4
+; CHECK-NEXT: [[TMP108:%.*]] = load float, float* [[TMP80]], align 4
+; CHECK-NEXT: [[TMP109:%.*]] = load float, float* [[TMP81]], align 4
+; CHECK-NEXT: [[TMP110:%.*]] = load float, float* [[TMP82]], align 4
+; CHECK-NEXT: [[TMP111:%.*]] = insertelement <4 x float> poison, float [[TMP107]], i32 0
+; CHECK-NEXT: [[TMP112:%.*]] = insertelement <4 x float> [[TMP111]], float [[TMP108]], i32 1
+; CHECK-NEXT: [[TMP113:%.*]] = insertelement <4 x float> [[TMP112]], float [[TMP109]], i32 2
+; CHECK-NEXT: [[TMP114:%.*]] = insertelement <4 x float> [[TMP113]], float [[TMP110]], i32 3
+; CHECK-NEXT: [[TMP115:%.*]] = fadd fast <4 x float> [[TMP42]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP116:%.*]] = fadd fast <4 x float> [[TMP50]], [[VEC_PHI1]]
+; CHECK-NEXT: [[TMP117:%.*]] = fadd fast <4 x float> [[TMP58]], [[VEC_PHI2]]
+; CHECK-NEXT: [[TMP118:%.*]] = fadd fast <4 x float> [[TMP66]], [[VEC_PHI3]]
+; CHECK-NEXT: [[TMP119]] = fadd fast <4 x float> [[TMP115]], [[TMP90]]
+; CHECK-NEXT: [[TMP120]] = fadd fast <4 x float> [[TMP116]], [[TMP98]]
+; CHECK-NEXT: [[TMP121]] = fadd fast <4 x float> [[TMP117]], [[TMP106]]
+; CHECK-NEXT: [[TMP122]] = fadd fast <4 x float> [[TMP118]], [[TMP114]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP123:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP123]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP120]], [[TMP119]]
+; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <4 x float> [[TMP121]], [[BIN_RDX]]
+; CHECK-NEXT: [[BIN_RDX5:%.*]] = fadd fast <4 x float> [[TMP122]], [[BIN_RDX4]]
+; CHECK-NEXT: [[TMP124:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX5]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[PREHEADER]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[PREHEADER]] ], [ [[TMP124]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[FOR:%.*]]
; CHECK: for:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR]] ]
-; CHECK-NEXT: [[S_02:%.*]] = phi float [ 0.000000e+00, [[PREHEADER]] ], [ [[ADD4:%.*]], [[FOR]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR]] ]
+; CHECK-NEXT: [[S_02:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD4:%.*]], [[FOR]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[T1:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[T2:%.*]] = load float, float* [[ARRAYIDX3]], align 4
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[T1]], [[S_02]]
; CHECK-NEXT: [[ADD4]] = fadd fast float [[ADD]], [[T2]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 32
; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], [[T0]]
-; CHECK-NEXT: br i1 [[CMP1]], label [[FOR]], label [[LOOPEXIT:%.*]]
+; CHECK-NEXT: br i1 [[CMP1]], label [[FOR]], label [[LOOPEXIT]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: loopexit:
-; CHECK-NEXT: [[ADD4_LCSSA:%.*]] = phi float [ [[ADD4]], [[FOR]] ]
+; CHECK-NEXT: [[ADD4_LCSSA:%.*]] = phi float [ [[ADD4]], [[FOR]] ], [ [[TMP124]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[FOR_END]]
; CHECK: for.end:
; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD4_LCSSA]], [[LOOPEXIT]] ]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4, !llvm.access.group !0
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !llvm.access.group !0
-; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i32> [[WIDE_LOAD1]] to <4 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 0
-; CHECK-NEXT: store i32 [[TMP13]], i32* [[TMP6]], align 4, !llvm.access.group !1
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 1
-; CHECK-NEXT: store i32 [[TMP14]], i32* [[TMP8]], align 4, !llvm.access.group !1
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT: store i32 [[TMP15]], i32* [[TMP10]], align 4, !llvm.access.group !1
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
-; CHECK-NEXT: store i32 [[TMP16]], i32* [[TMP12]], align 4, !llvm.access.group !1
-; CHECK-NEXT: [[TMP17:%.*]] = or i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP17]]
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4, !llvm.access.group !0
-; CHECK-NEXT: [[TMP20:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[WIDE_LOAD2]], <4 x i32>* [[TMP20]], align 4, !llvm.access.group !0
+; CHECK-NEXT: [[TMP0:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !llvm.access.group !0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4, !llvm.access.group !0
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4, !llvm.access.group !0
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP7]], align 4, !llvm.access.group !0
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP8]], align 4, !llvm.access.group !0
+; CHECK-NEXT: [[TMP13:%.*]] = sext i32 [[TMP9]] to i64
+; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP10]] to i64
+; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP11]] to i64
+; CHECK-NEXT: [[TMP16:%.*]] = sext i32 [[TMP12]] to i64
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP14]]
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP15]]
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP16]]
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT: store i32 [[TMP21]], i32* [[TMP17]], align 4, !llvm.access.group !1
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT: store i32 [[TMP22]], i32* [[TMP18]], align 4, !llvm.access.group !1
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 2
+; CHECK-NEXT: store i32 [[TMP23]], i32* [[TMP19]], align 4, !llvm.access.group !1
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT: store i32 [[TMP24]], i32* [[TMP20]], align 4, !llvm.access.group !1
+; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP25]]
+; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP27]], align 4, !llvm.access.group !0
+; CHECK-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[WIDE_LOAD1]], <4 x i32>* [[TMP28]], align 4, !llvm.access.group !0
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
-; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
+; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP52:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP53:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP51:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
-; CHECK-NEXT: [[TMP8:%.*]] = udiv <4 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8>
-; CHECK-NEXT: [[TMP9:%.*]] = udiv <4 x i64> [[STEP_ADD]], <i64 8, i64 8, i64 8, i64 8>
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE:%.*]], i64 [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP8]], i32 1
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP8]], i32 2
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP8]], i32 3
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP9]], i32 0
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP18]]
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP9]], i32 1
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP20]]
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP9]], i32 2
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP22]]
-; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP9]], i32 3
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = load i8, i8* [[TMP11]], align 1
-; CHECK-NEXT: [[TMP27:%.*]] = load i8, i8* [[TMP13]], align 1
-; CHECK-NEXT: [[TMP28:%.*]] = load i8, i8* [[TMP15]], align 1
-; CHECK-NEXT: [[TMP29:%.*]] = load i8, i8* [[TMP17]], align 1
-; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x i8> poison, i8 [[TMP26]], i32 0
-; CHECK-NEXT: [[TMP31:%.*]] = insertelement <4 x i8> [[TMP30]], i8 [[TMP27]], i32 1
-; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i8> [[TMP31]], i8 [[TMP28]], i32 2
-; CHECK-NEXT: [[TMP33:%.*]] = insertelement <4 x i8> [[TMP32]], i8 [[TMP29]], i32 3
-; CHECK-NEXT: [[TMP34:%.*]] = load i8, i8* [[TMP19]], align 1
-; CHECK-NEXT: [[TMP35:%.*]] = load i8, i8* [[TMP21]], align 1
-; CHECK-NEXT: [[TMP36:%.*]] = load i8, i8* [[TMP23]], align 1
-; CHECK-NEXT: [[TMP37:%.*]] = load i8, i8* [[TMP25]], align 1
-; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i8> poison, i8 [[TMP34]], i32 0
-; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i8> [[TMP38]], i8 [[TMP35]], i32 1
-; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x i8> [[TMP39]], i8 [[TMP36]], i32 2
-; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x i8> [[TMP40]], i8 [[TMP37]], i32 3
-; CHECK-NEXT: [[TMP42:%.*]] = urem <4 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8>
-; CHECK-NEXT: [[TMP43:%.*]] = urem <4 x i64> [[STEP_ADD]], <i64 8, i64 8, i64 8, i64 8>
-; CHECK-NEXT: [[TMP44:%.*]] = trunc <4 x i64> [[TMP42]] to <4 x i8>
-; CHECK-NEXT: [[TMP45:%.*]] = trunc <4 x i64> [[TMP43]] to <4 x i8>
-; CHECK-NEXT: [[TMP46:%.*]] = lshr <4 x i8> [[TMP33]], [[TMP44]]
-; CHECK-NEXT: [[TMP47:%.*]] = lshr <4 x i8> [[TMP41]], [[TMP45]]
-; CHECK-NEXT: [[TMP48:%.*]] = and <4 x i8> [[TMP46]], <i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT: [[TMP49:%.*]] = and <4 x i8> [[TMP47]], <i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT: [[TMP50:%.*]] = zext <4 x i8> [[TMP48]] to <4 x i32>
-; CHECK-NEXT: [[TMP51:%.*]] = zext <4 x i8> [[TMP49]] to <4 x i32>
-; CHECK-NEXT: [[TMP52]] = add <4 x i32> [[VEC_PHI]], [[TMP50]]
-; CHECK-NEXT: [[TMP53]] = add <4 x i32> [[VEC_PHI2]], [[TMP51]]
+; CHECK-NEXT: [[TMP8:%.*]] = udiv i64 [[TMP0]], 8
+; CHECK-NEXT: [[TMP9:%.*]] = udiv i64 [[TMP1]], 8
+; CHECK-NEXT: [[TMP10:%.*]] = udiv i64 [[TMP2]], 8
+; CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[TMP3]], 8
+; CHECK-NEXT: [[TMP12:%.*]] = udiv i64 [[TMP4]], 8
+; CHECK-NEXT: [[TMP13:%.*]] = udiv i64 [[TMP5]], 8
+; CHECK-NEXT: [[TMP14:%.*]] = udiv i64 [[TMP6]], 8
+; CHECK-NEXT: [[TMP15:%.*]] = udiv i64 [[TMP7]], 8
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP10]]
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP14]]
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP15]]
+; CHECK-NEXT: [[TMP24:%.*]] = load i8, i8* [[TMP16]], align 1
+; CHECK-NEXT: [[TMP25:%.*]] = load i8, i8* [[TMP17]], align 1
+; CHECK-NEXT: [[TMP26:%.*]] = load i8, i8* [[TMP18]], align 1
+; CHECK-NEXT: [[TMP27:%.*]] = load i8, i8* [[TMP19]], align 1
+; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x i8> poison, i8 [[TMP24]], i32 0
+; CHECK-NEXT: [[TMP29:%.*]] = insertelement <4 x i8> [[TMP28]], i8 [[TMP25]], i32 1
+; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x i8> [[TMP29]], i8 [[TMP26]], i32 2
+; CHECK-NEXT: [[TMP31:%.*]] = insertelement <4 x i8> [[TMP30]], i8 [[TMP27]], i32 3
+; CHECK-NEXT: [[TMP32:%.*]] = load i8, i8* [[TMP20]], align 1
+; CHECK-NEXT: [[TMP33:%.*]] = load i8, i8* [[TMP21]], align 1
+; CHECK-NEXT: [[TMP34:%.*]] = load i8, i8* [[TMP22]], align 1
+; CHECK-NEXT: [[TMP35:%.*]] = load i8, i8* [[TMP23]], align 1
+; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i8> poison, i8 [[TMP32]], i32 0
+; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i8> [[TMP36]], i8 [[TMP33]], i32 1
+; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i8> [[TMP37]], i8 [[TMP34]], i32 2
+; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i8> [[TMP38]], i8 [[TMP35]], i32 3
+; CHECK-NEXT: [[TMP40:%.*]] = urem <4 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8>
+; CHECK-NEXT: [[TMP41:%.*]] = urem <4 x i64> [[STEP_ADD]], <i64 8, i64 8, i64 8, i64 8>
+; CHECK-NEXT: [[TMP42:%.*]] = trunc <4 x i64> [[TMP40]] to <4 x i8>
+; CHECK-NEXT: [[TMP43:%.*]] = trunc <4 x i64> [[TMP41]] to <4 x i8>
+; CHECK-NEXT: [[TMP44:%.*]] = lshr <4 x i8> [[TMP31]], [[TMP42]]
+; CHECK-NEXT: [[TMP45:%.*]] = lshr <4 x i8> [[TMP39]], [[TMP43]]
+; CHECK-NEXT: [[TMP46:%.*]] = and <4 x i8> [[TMP44]], <i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT: [[TMP47:%.*]] = and <4 x i8> [[TMP45]], <i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT: [[TMP48:%.*]] = zext <4 x i8> [[TMP46]] to <4 x i32>
+; CHECK-NEXT: [[TMP49:%.*]] = zext <4 x i8> [[TMP47]] to <4 x i32>
+; CHECK-NEXT: [[TMP50]] = add <4 x i32> [[VEC_PHI]], [[TMP48]]
+; CHECK-NEXT: [[TMP51]] = add <4 x i32> [[VEC_PHI2]], [[TMP49]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT: br i1 [[TMP54]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP53]], [[TMP52]]
-; CHECK-NEXT: [[TMP55:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP51]], [[TMP50]]
+; CHECK-NEXT: [[TMP53:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP55]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP53]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094
; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: loop_exit:
-; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LOOP]] ], [ [[TMP55]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LOOP]] ], [ [[TMP53]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]]
;
entry: