From 47aaa99c0e1e28573bf24d95c5540005ee734531 Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Fri, 18 Dec 2020 08:49:05 -0500
Subject: [PATCH] [VectorCombine] allow peeking through GEPs when creating a vector load

This is an enhancement motivated by https://llvm.org/PR16739
(see D92858 for another).

We can look through a GEP to find a base pointer that may be
safe to use for a vector load. If so, then we shuffle (shift)
the necessary vector element over to index 0.

Alive2 proof based on one of the regression tests:
https://alive2.llvm.org/ce/z/yPJLkh

The vector translation is independent of endianness (verify by
changing to leading 'E' in the datalayout string).

Differential Revision: https://reviews.llvm.org/D93229
---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 54 +++++++++++++++---
 llvm/test/Transforms/VectorCombine/X86/load.ll  | 73 ++++++++++++++++---------
 2 files changed, 95 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 8e34161..a865f88 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -93,6 +93,7 @@ static void replaceValue(Value &Old, Value &New) {
 
 bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // Match insert into fixed vector of scalar value.
+  // TODO: Handle non-zero insert index.
   auto *Ty = dyn_cast<FixedVectorType>(I.getType());
   Value *Scalar;
   if (!Ty || !match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) ||
@@ -115,7 +116,6 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
       mustSuppressSpeculation(*Load))
     return false;
 
-  // TODO: Extend this to match GEP with constant offsets.
   const DataLayout &DL = I.getModule()->getDataLayout();
   Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
   assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
@@ -127,10 +127,13 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   if (AS != SrcPtr->getType()->getPointerAddressSpace())
     SrcPtr = Load->getPointerOperand();
 
+  // We are potentially transforming byte-sized (8-bit) memory accesses, so make
+  // sure we have all of our type-based constraints in place for this target.
   Type *ScalarTy = Scalar->getType();
   uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
   unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
-  if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0)
+  if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
+      ScalarSize % 8 != 0)
     return false;
 
   // Check safety of replacing the scalar load with a larger vector load.
@@ -139,12 +142,45 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // we may use a larger value based on alignment attributes.
   unsigned MinVecNumElts = MinVectorSize / ScalarSize;
   auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
-  if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT))
-    return false;
+  unsigned OffsetEltIndex = 0;
+  Align Alignment = Load->getAlign();
+  if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT)) {
+    // It is not safe to load directly from the pointer, but we can still peek
+    // through gep offsets and check if it is safe to load from a base address
+    // with updated alignment. If it is, we can shuffle the element(s) into
+    // place after loading.
+    unsigned OffsetBitWidth = DL.getIndexTypeSizeInBits(SrcPtr->getType());
+    APInt Offset(OffsetBitWidth, 0);
+    SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset);
+
+    // We want to shuffle the result down from a high element of a vector, so
+    // the offset must be positive.
+    if (Offset.isNegative())
+      return false;
+
+    // The offset must be a multiple of the scalar element size so that the
+    // shuffle can move whole elements into place.
+    uint64_t ScalarSizeInBytes = ScalarSize / 8;
+    if (Offset.urem(ScalarSizeInBytes) != 0)
+      return false;
+
+    // If we load MinVecNumElts, will our target element still be loaded?
+    OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
+    if (OffsetEltIndex >= MinVecNumElts)
+      return false;
+
+    if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT))
+      return false;
+
+    // Update alignment with offset value. Note that the offset could be negated
+    // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
+    // negation does not change the result of the alignment calculation.
+    Alignment = commonAlignment(Alignment, Offset.getZExtValue());
+  }
 
   // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
   // Use the greater of the alignment on the load or its source pointer.
-  Align Alignment = std::max(SrcPtr->getPointerAlignment(DL), Load->getAlign());
+  Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment);
   Type *LoadTy = Load->getType();
   int OldCost = TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
   APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
@@ -153,6 +189,9 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
 
   // New pattern: load VecPtr
   int NewCost = TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS);
+  // Optionally, we are shuffling the loaded vector element(s) into place.
+  if (OffsetEltIndex)
+    NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy);
 
   // We can aggressively convert to the vector form because the backend can
   // invert this transform if it does not result in a performance win.
@@ -168,12 +207,13 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // Set everything but element 0 to undef to prevent poison from propagating
   // from the extra loaded memory. This will also optionally shrink/grow the
   // vector from the loaded size to the output size.
-  // We assume this operation has no cost in codegen.
+  // We assume this operation has no cost in codegen if there was no offset.
   // Note that we could use freeze to avoid poison problems, but then we might
   // still need a shuffle to change the vector size.
   unsigned OutputNumElts = Ty->getNumElements();
   SmallVector<int> Mask(OutputNumElts, UndefMaskElem);
-  Mask[0] = 0;
+  assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
+  Mask[0] = OffsetEltIndex;
   VecLd = Builder.CreateShuffleVector(VecLd, Mask);
   replaceValue(I, *VecLd);
 
diff --git a/llvm/test/Transforms/VectorCombine/X86/load.ll b/llvm/test/Transforms/VectorCombine/X86/load.ll
index 6b4fe43..1665c5d 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s
-; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s
+; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -269,14 +269,19 @@ define <8 x i16> @gep01_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceabl
   ret <8 x i16> %r
 }
 
-; Negative test - can't safely load the offset vector, but could load+shuffle.
+; Can't safely load the offset vector, but can load+shuffle if it is profitable.
 
 define <8 x i16> @gep01_load_i16_insert_v8i16_deref(<8 x i16>* align 16 dereferenceable(17) %p) {
-; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 2
-; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
-; CHECK-NEXT:    ret <8 x i16> [[R]]
+; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 2
+; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
+; SSE2-NEXT:    ret <8 x i16> [[R]]
+;
+; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[P:%.*]], align 16
+; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1
   %s = load i16, i16* %gep, align 2
@@ -284,14 +289,19 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref(<8 x i16>* align 16 derefere
   ret <8 x i16> %r
 }
 
-; TODO: Verify that alignment of the new load is not over-specified.
+; Verify that alignment of the new load is not over-specified.
 
 define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(<8 x i16>* align 2 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 8
-; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
-; CHECK-NEXT:    ret <8 x i16> [[R]]
+; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 8
+; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
+; SSE2-NEXT:    ret <8 x i16> [[R]]
+;
+; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[P:%.*]], align 2
+; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1
   %s = load i16, i16* %gep, align 8
@@ -299,6 +309,10 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(<8 x i16>* align 2
   ret <8 x i16> %r
 }
 
+; Negative test - if we are shuffling a load from the base pointer, the address offset
+; must be a multiple of element size.
+; TODO: Could bitcast around this limitation.
+
 define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 dereferenceable(16) %p) {
 ; CHECK-LABEL: @gep01_bitcast_load_i32_insert_v4i32(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[P:%.*]], i64 0, i64 1
@@ -316,10 +330,9 @@ define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 derefer
 
 define <4 x i32> @gep012_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 dereferenceable(20) %p) {
 ; CHECK-LABEL: @gep012_bitcast_load_i32_insert_v4i32(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[P:%.*]], i64 0, i64 12
-; CHECK-NEXT:    [[B:%.*]] = bitcast i8* [[GEP]] to i32*
-; CHECK-NEXT:    [[S:%.*]] = load i32, i32* [[B]], align 1
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8>* [[P:%.*]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
   %gep = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i64 0, i64 12
@@ -329,6 +342,10 @@ define <4 x i32> @gep012_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 derefe
   ret <4 x i32> %r
 }
 
+; Negative test - if we are shuffling a load from the base pointer, the address offset
+; must be a multiple of element size and the offset must be low enough to fit in the vector
+; (bitcasting would not help this case).
+
 define <4 x i32> @gep013_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 dereferenceable(20) %p) {
 ; CHECK-LABEL: @gep013_bitcast_load_i32_insert_v4i32(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[P:%.*]], i64 0, i64 13
@@ -608,15 +625,21 @@ define <8 x i32> @load_v1i32_extract_insert_v8i32_extra_use(<1 x i32>* align 16
   ret <8 x i32> %r
 }
 
-; TODO: Can't safely load the offset vector, but can load+shuffle if it is profitable.
+; Can't safely load the offset vector, but can load+shuffle if it is profitable.
 
 define <8 x i16> @gep1_load_v2i16_extract_insert_v8i16(<2 x i16>* align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[P:%.*]], i64 1
-; CHECK-NEXT:    [[L:%.*]] = load <2 x i16>, <2 x i16>* [[GEP]], align 8
-; CHECK-NEXT:    [[S:%.*]] = extractelement <2 x i16> [[L]], i32 0
-; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
-; CHECK-NEXT:    ret <8 x i16> [[R]]
+; SSE2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[P:%.*]], i64 1
+; SSE2-NEXT:    [[L:%.*]] = load <2 x i16>, <2 x i16>* [[GEP]], align 8
+; SSE2-NEXT:    [[S:%.*]] = extractelement <2 x i16> [[L]], i32 0
+; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
+; SSE2-NEXT:    ret <8 x i16> [[R]]
+;
+; AVX2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
+; AVX2-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16>* [[P:%.*]] to <8 x i16>*
+; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 4
+; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> undef, <8 x i32> <i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <2 x i16>, <2 x i16>* %p, i64 1
   %l = load <2 x i16>, <2 x i16>* %gep, align 8
-- 
2.7.4
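
A before/after IR sketch of the transform, drawn from the gep012_bitcast_load_i32_insert_v4i32 regression test in the patch (value names are illustrative): the scalar load at byte offset 12 is widened to a vector load from the base pointer, and a shuffle moves element 3 down to lane 0.

Before:
  %gep = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i64 0, i64 12
  %b = bitcast i8* %gep to i32*
  %s = load i32, i32* %b, align 1
  %r = insertelement <4 x i32> undef, i32 %s, i64 0

After:
  %t1 = bitcast <16 x i8>* %p to <4 x i32>*
  %t2 = load <4 x i32>, <4 x i32>* %t1, align 1
  %r = shufflevector <4 x i32> %t2, <4 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>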