InstructionWorklist Worklist;
bool vectorizeLoadInsert(Instruction &I);
+ bool widenSubvectorLoad(Instruction &I);
ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
ExtractElementInst *Ext1,
unsigned PreferredExtractIndex) const;
return true;
}
+/// If we are loading a vector and then inserting it into a larger vector with
+/// undefined elements, try to load the larger vector and eliminate the insert.
+/// This removes a shuffle in IR and may allow combining of other loaded values.
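+/// A sketch of the transform, mirroring the tests below (it only applies when
+/// the wider pointer is known dereferenceable and the cost model agrees):
+///   %l = load <2 x float>, ptr %p
+///   %s = shufflevector <2 x float> %l, <2 x float> poison,
+///                      <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+/// -->
+///   %s = load <4 x float>, ptr %p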
+bool VectorCombine::widenSubvectorLoad(Instruction &I) {
+ // Match subvector insert of fixed vector.
+ auto *Ty = dyn_cast<FixedVectorType>(I.getType());
+ auto *Shuf = dyn_cast<ShuffleVectorInst>(&I);
+ if (!Ty || !Shuf || !Shuf->isIdentityWithPadding())
+ return false;
+
+ // Allow a non-canonical shuffle mask that is choosing elements from op1.
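+ // (If any mask element selects from operand 1, the any_of below returns
+ // true, which converts directly to the operand index of the subvector.)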
+ unsigned NumOpElts =
+ cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
+ unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
+ return M >= (int)(NumOpElts);
+ });
+
+ auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
+ if (!canWidenLoad(Load, TTI))
+ return false;
+
+ // We use minimal alignment (maximum flexibility) because we only care about
+ // the dereferenceable region. When calculating cost and creating a new op,
+ // we may use a larger value based on alignment attributes.
+ const DataLayout &DL = I.getModule()->getDataLayout();
+ Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
+ assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
+ Align Alignment = Load->getAlign();
+ if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), DL, Load, &AC, &DT))
+ return false;
+
+ Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment);
+ Type *LoadTy = Load->getType();
+ unsigned AS = Load->getPointerAddressSpace();
+
+ // Original pattern: insert_subvector (load PtrOp)
+ // This conservatively assumes that the cost of a subvector insert into an
+ // undef value is 0. We could add that cost if the cost model accurately
+ // reflects the real cost of that operation.
+ InstructionCost OldCost =
+ TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
+
+ // New pattern: load PtrOp
+ InstructionCost NewCost =
+ TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS);
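+ // (For example, per the v2f32 -> v8f32 test, widening a 64-bit load to 256
+ // bits is expected to be rejected for SSE-only targets but allowed with AVX.)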
+
+ // We can aggressively convert to the vector form because the backend can
+ // invert this transform if it does not result in a performance win.
+ if (OldCost < NewCost || !NewCost.isValid())
+ return false;
+
+ IRBuilder<> Builder(Load);
+ Value *CastedPtr =
+ Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Ty->getPointerTo(AS));
+ Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
+ replaceValue(I, *VecLd);
+ ++NumVecLoad;
+ return true;
+}
+
/// Determine which, if any, of the inputs should be replaced by a shuffle
/// followed by extract from a different index.
ExtractElementInst *VectorCombine::getShuffleExtract(
Builder.SetInsertPoint(&I);
if (!ScalarizationOnly) {
MadeChange |= vectorizeLoadInsert(I);
+ MadeChange |= widenSubvectorLoad(I);
MadeChange |= foldExtractExtract(I);
MadeChange |= foldInsExtFNeg(I);
MadeChange |= foldBitcastShuf(I);
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 --data-layout="e" | FileCheck %s --check-prefixes=CHECK
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 --data-layout="e" | FileCheck %s --check-prefixes=CHECK
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 --data-layout="E" | FileCheck %s --check-prefixes=CHECK
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 --data-layout="E" | FileCheck %s --check-prefixes=CHECK
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 --data-layout="e" | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 --data-layout="e" | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 --data-layout="E" | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 --data-layout="E" | FileCheck %s --check-prefixes=CHECK,AVX
;-------------------------------------------------------------------------------
; Here we know we can load 128 bits as per dereferenceability and alignment.
ret <2 x i4> %r
}
+; Load the 128-bit vector because there is no additional cost.
+
define <4 x float> @load_v1f32_v4f32(ptr dereferenceable(16) %p) {
; CHECK-LABEL: @load_v1f32_v4f32(
-; CHECK-NEXT: [[L:%.*]] = load <1 x float>, ptr [[P:%.*]], align 16
-; CHECK-NEXT: [[S:%.*]] = shufflevector <1 x float> [[L]], <1 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[S:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
; CHECK-NEXT: ret <4 x float> [[S]]
;
%l = load <1 x float>, ptr %p, align 16
ret <4 x float> %s
}
+; Load the 128-bit vector because there is no additional cost.
+; Alignment is taken from param attr.
+
define <4 x float> @load_v2f32_v4f32(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @load_v2f32_v4f32(
-; CHECK-NEXT: [[L:%.*]] = load <2 x float>, ptr [[P:%.*]], align 1
-; CHECK-NEXT: [[S:%.*]] = shufflevector <2 x float> [[L]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT: [[S:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
; CHECK-NEXT: ret <4 x float> [[S]]
;
%l = load <2 x float>, ptr %p, align 1
ret <4 x float> %s
}
+; Load the 128-bit vector because there is no additional cost.
+
define <4 x float> @load_v3f32_v4f32(ptr dereferenceable(16) %p) {
; CHECK-LABEL: @load_v3f32_v4f32(
-; CHECK-NEXT: [[L:%.*]] = load <3 x float>, ptr [[P:%.*]], align 1
-; CHECK-NEXT: [[S:%.*]] = shufflevector <3 x float> [[L]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+; CHECK-NEXT: [[S:%.*]] = load <4 x float>, ptr [[P:%.*]], align 1
; CHECK-NEXT: ret <4 x float> [[S]]
;
%l = load <3 x float>, ptr %p, align 1
ret <4 x float> %s
}
+; Negative test - the shuffle must be a simple subvector insert.
+
define <4 x float> @load_v3f32_v4f32_wrong_mask(ptr dereferenceable(16) %p) {
; CHECK-LABEL: @load_v3f32_v4f32_wrong_mask(
; CHECK-NEXT: [[L:%.*]] = load <3 x float>, ptr [[P:%.*]], align 1
ret <4 x float> %s
}
+; Negative test - must be dereferenceable to vector width.
+
define <4 x float> @load_v3f32_v4f32_not_deref(ptr dereferenceable(15) %p) {
; CHECK-LABEL: @load_v3f32_v4f32_not_deref(
; CHECK-NEXT: [[L:%.*]] = load <3 x float>, ptr [[P:%.*]], align 16
ret <4 x float> %s
}
+; Without AVX, the cost of loading 256 bits would be greater.
+
define <8 x float> @load_v2f32_v8f32(ptr dereferenceable(32) %p) {
-; CHECK-LABEL: @load_v2f32_v8f32(
-; CHECK-NEXT: [[L:%.*]] = load <2 x float>, ptr [[P:%.*]], align 1
-; CHECK-NEXT: [[S:%.*]] = shufflevector <2 x float> [[L]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: ret <8 x float> [[S]]
+; SSE-LABEL: @load_v2f32_v8f32(
+; SSE-NEXT: [[L:%.*]] = load <2 x float>, ptr [[P:%.*]], align 1
+; SSE-NEXT: [[S:%.*]] = shufflevector <2 x float> [[L]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT: ret <8 x float> [[S]]
+;
+; AVX-LABEL: @load_v2f32_v8f32(
+; AVX-NEXT: [[S:%.*]] = load <8 x float>, ptr [[P:%.*]], align 1
+; AVX-NEXT: ret <8 x float> [[S]]
;
%l = load <2 x float>, ptr %p, align 1
%s = shufflevector <2 x float> %l, <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x float> %s
}
+; Integer type is ok too.
+
define <4 x i32> @load_v2i32_v4i32(ptr dereferenceable(16) %p) {
; CHECK-LABEL: @load_v2i32_v4i32(
-; CHECK-NEXT: [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
-; CHECK-NEXT: [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[S:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
; CHECK-NEXT: ret <4 x i32> [[S]]
;
%l = load <2 x i32>, ptr %p, align 1
ret <4 x i32> %s
}
+; TODO: The shuffle mask is not a canonical identity-with-padding mask, so this is not matched yet.
+
define <4 x i32> @load_v2i32_v4i32_non_canonical_mask(ptr dereferenceable(16) %p) {
; CHECK-LABEL: @load_v2i32_v4i32_non_canonical_mask(
; CHECK-NEXT: [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
ret <4 x i32> %s
}
+; Allow non-canonical commuted shuffle.
+
define <4 x i32> @load_v2i32_v4i32_non_canonical_mask_commute(ptr dereferenceable(16) %p) {
; CHECK-LABEL: @load_v2i32_v4i32_non_canonical_mask_commute(
-; CHECK-NEXT: [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
-; CHECK-NEXT: [[S:%.*]] = shufflevector <2 x i32> poison, <2 x i32> [[L]], <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[S:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
; CHECK-NEXT: ret <4 x i32> [[S]]
;
%l = load <2 x i32>, ptr %p, align 1
ret <4 x i32> %s
}
+; The wide load must be in the same addrspace as the original load.
+
define <4 x i32> @load_v2i32_v4i32_addrspacecast(ptr addrspace(5) align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @load_v2i32_v4i32_addrspacecast(
-; CHECK-NEXT: [[ASC:%.*]] = addrspacecast ptr addrspace(5) [[P:%.*]] to ptr addrspace(42)
-; CHECK-NEXT: [[L:%.*]] = load <2 x i32>, ptr addrspace(42) [[ASC]], align 4
-; CHECK-NEXT: [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[P:%.*]] to ptr addrspace(42)
+; CHECK-NEXT: [[S:%.*]] = load <4 x i32>, ptr addrspace(42) [[TMP1]], align 16
; CHECK-NEXT: ret <4 x i32> [[S]]
;
%asc = addrspacecast ptr addrspace(5) %p to ptr addrspace(42)