From 435c4e0792f1b412cfd70a8ac44a2f12e800c0e9 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Mon, 15 Oct 2012 08:40:30 +0000 Subject: [PATCH] First major step toward addressing PR14059. This teaches SROA to handle cases where we have partial integer loads and stores to an otherwise promotable alloca to widen[1] those loads and stores to cover the entire alloca and bitcast them into the appropriate type such that promotion can proceed. These partial loads and stores stem from an annoying confluence of ARM's calling convention and ABI lowering and the FCA pre-splitting which takes place in SROA. Clang lowers a { double, double } in-register function argument as a [4 x i32] function argument to ensure it is placed into integer 32-bit registers (a really unnerving implicit contract between Clang and the ARM backend I would add). This results in a FCA load of [4 x i32]* from the { double, double } alloca, and SROA decomposes this into a sequence of i32 loads and stores. Inlining proceeds, code gets folded, but at the end of the day, we still have i32 stores to the low and high halves of a double alloca. Widening these to be i64 operations, and bitcasting them to double prior to loading or storing allows promotion to proceed for these allocas. I looked quite a bit changing the IR which Clang produces for this case to be more friendly, but small changes seem unlikely to help. I think the best representation we could use currently would be to pass 4 i32 arguments thereby avoiding any FCAs, but that would still require this fix. It seems like it might eventually be nice to somehow encode the ABI register selection choices outside of the parameter type system so that the parameter can be a { double, double }, but the CC register annotations indicate that this should be passed via 4 integer registers. This patch does not address the second problem in PR14059, which is the reverse: when a struct alloca is loaded as a *larger* single integer. 
This patch also does not address some of the code quality issues with the FCA-splitting. Those don't actually impede any optimizations really, but they're on my list to clean up. [1]: Pedantic footnote: for those concerned about memory model issues here, this is safe. For the alloca to be promotable, it cannot escape or have any use of its address that could allow these loads or stores to be racing. Thus, widening is always safe. llvm-svn: 165928 --- llvm/lib/Transforms/Scalar/SROA.cpp | 146 ++++++++++++++++++---------- llvm/test/Transforms/SROA/basictest.ll | 38 +++++++- llvm/test/Transforms/SROA/phi-and-select.ll | 42 ++++---- 3 files changed, 150 insertions(+), 76 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 4a3735d..a2267d0 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -2120,47 +2120,74 @@ static bool isVectorPromotionViable(const DataLayout &TD, return true; } -/// \brief Test whether the given alloca partition can be promoted to an int. +/// \brief Test whether the given alloca partition's integer operations can be +/// widened to promotable ones. /// -/// This is a quick test to check whether we can rewrite a particular alloca -/// partition (and its newly formed alloca) into an integer alloca suitable for -/// promotion to an SSA value. We only can ensure this for a limited set of -/// operations, and we don't want to do the rewrites unless we are confident -/// that the result will be promotable, so we have an early test here. 
-static bool isIntegerPromotionViable(const DataLayout &TD, - Type *AllocaTy, - uint64_t AllocBeginOffset, - AllocaPartitioning &P, - AllocaPartitioning::const_use_iterator I, - AllocaPartitioning::const_use_iterator E) { - IntegerType *Ty = dyn_cast(AllocaTy); - if (!Ty || 8*TD.getTypeStoreSize(Ty) != Ty->getBitWidth()) +/// This is a quick test to check whether we can rewrite the integer loads and +/// stores to a particular alloca into wider loads and stores and be able to +/// promote the resulting alloca. +static bool isIntegerWideningViable(const DataLayout &TD, + Type *AllocaTy, + uint64_t AllocBeginOffset, + AllocaPartitioning &P, + AllocaPartitioning::const_use_iterator I, + AllocaPartitioning::const_use_iterator E) { + uint64_t SizeInBits = TD.getTypeSizeInBits(AllocaTy); + + // Don't try to handle allocas with bit-padding. + if (SizeInBits != TD.getTypeStoreSizeInBits(AllocaTy)) return false; + uint64_t Size = TD.getTypeStoreSize(AllocaTy); + // Check the uses to ensure the uses are (likely) promoteable integer uses. // Also ensure that the alloca has a covering load or store. We don't want - // promote because of some other unsplittable entry (which we may make - // splittable later) and lose the ability to promote each element access. + // to widen the integer operations only to fail to promote due to some other + // unsplittable entry (which we may make splittable later). bool WholeAllocaOp = false; for (; I != E; ++I) { if (!I->U) continue; // Skip dead use. + uint64_t RelBegin = I->BeginOffset - AllocBeginOffset; + uint64_t RelEnd = I->EndOffset - AllocBeginOffset; + // We can't reasonably handle cases where the load or store extends past // the end of the aloca's type and into its padding. 
- if ((I->EndOffset - AllocBeginOffset) > TD.getTypeStoreSize(Ty)) + if (RelEnd > Size) return false; if (LoadInst *LI = dyn_cast(I->U->getUser())) { - if (LI->isVolatile() || !LI->getType()->isIntegerTy()) + if (LI->isVolatile()) return false; - if (LI->getType() == Ty) + if (RelBegin == 0 && RelEnd == Size) WholeAllocaOp = true; + if (IntegerType *ITy = dyn_cast(LI->getType())) { + if (ITy->getBitWidth() < TD.getTypeStoreSize(ITy)) + return false; + continue; + } + // Non-integer loads need to be convertible from the alloca type so that + // they are promotable. + if (RelBegin != 0 || RelEnd != Size || + !canConvertValue(TD, AllocaTy, LI->getType())) + return false; } else if (StoreInst *SI = dyn_cast(I->U->getUser())) { - if (SI->isVolatile() || !SI->getValueOperand()->getType()->isIntegerTy()) + Type *ValueTy = SI->getValueOperand()->getType(); + if (SI->isVolatile()) return false; - if (SI->getValueOperand()->getType() == Ty) + if (RelBegin == 0 && RelEnd == Size) WholeAllocaOp = true; + if (IntegerType *ITy = dyn_cast(ValueTy)) { + if (ITy->getBitWidth() < TD.getTypeStoreSize(ITy)) + return false; + continue; + } + // Non-integer stores need to be convertible to the alloca type so that + // they are promotable. 
+ if (RelBegin != 0 || RelEnd != Size || + !canConvertValue(TD, ValueTy, AllocaTy)) + return false; } else if (MemIntrinsic *MI = dyn_cast(I->U->getUser())) { if (MI->isVolatile()) return false; @@ -2170,6 +2197,10 @@ static bool isIntegerPromotionViable(const DataLayout &TD, if (!MTO.IsSplittable) return false; } + } else if (IntrinsicInst *II = dyn_cast(I->U->getUser())) { + if (II->getIntrinsicID() != Intrinsic::lifetime_start && + II->getIntrinsicID() != Intrinsic::lifetime_end) + return false; } else { return false; } @@ -2210,10 +2241,10 @@ class AllocaPartitionRewriter : public InstVisitorgetScalarSizeInBits() % 8) == 0 && "Only multiple-of-8 sized vector elements are viable"); ElementSize = VecTy->getScalarSizeInBits() / 8; - } else if (isIntegerPromotionViable(TD, NewAI.getAllocatedType(), - NewAllocaBeginOffset, P, I, E)) { - IntPromotionTy = cast(NewAI.getAllocatedType()); + } else if (isIntegerWideningViable(TD, NewAI.getAllocatedType(), + NewAllocaBeginOffset, P, I, E)) { + IntTy = Type::getIntNTy(NewAI.getContext(), + TD.getTypeSizeInBits(NewAI.getAllocatedType())); } bool CanSROA = true; for (; I != E; ++I) { @@ -2270,6 +2302,10 @@ public: ElementTy = 0; ElementSize = 0; } + if (IntTy) { + assert(CanSROA); + IntTy = 0; + } return CanSROA; } @@ -2333,55 +2369,56 @@ private: Value *extractInteger(IRBuilder<> &IRB, IntegerType *TargetTy, uint64_t Offset) { - assert(IntPromotionTy && "Alloca is not an integer we can extract from"); + assert(IntTy && "We cannot extract an integer from the alloca"); Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), getName(".load")); + V = convertValue(TD, IRB, V, IntTy); assert(Offset >= NewAllocaBeginOffset && "Out of bounds offset"); uint64_t RelOffset = Offset - NewAllocaBeginOffset; assert(TD.getTypeStoreSize(TargetTy) + RelOffset <= - TD.getTypeStoreSize(IntPromotionTy) && + TD.getTypeStoreSize(IntTy) && "Element load outside of alloca store"); uint64_t ShAmt = 8*RelOffset; if (TD.isBigEndian()) - ShAmt 
= 8*(TD.getTypeStoreSize(IntPromotionTy) - + ShAmt = 8*(TD.getTypeStoreSize(IntTy) - TD.getTypeStoreSize(TargetTy) - RelOffset); if (ShAmt) V = IRB.CreateLShr(V, ShAmt, getName(".shift")); - if (TargetTy != IntPromotionTy) { - assert(TargetTy->getBitWidth() < IntPromotionTy->getBitWidth() && - "Cannot extract to a larger integer!"); + assert(TargetTy->getBitWidth() <= IntTy->getBitWidth() && + "Cannot extract to a larger integer!"); + if (TargetTy != IntTy) V = IRB.CreateTrunc(V, TargetTy, getName(".trunc")); - } return V; } StoreInst *insertInteger(IRBuilder<> &IRB, Value *V, uint64_t Offset) { IntegerType *Ty = cast(V->getType()); - if (Ty == IntPromotionTy) - return IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); - - assert(Ty->getBitWidth() < IntPromotionTy->getBitWidth() && + assert(Ty->getBitWidth() <= IntTy->getBitWidth() && "Cannot insert a larger integer!"); - V = IRB.CreateZExt(V, IntPromotionTy, getName(".ext")); + if (Ty != IntTy) + V = IRB.CreateZExt(V, IntTy, getName(".ext")); assert(Offset >= NewAllocaBeginOffset && "Out of bounds offset"); uint64_t RelOffset = Offset - NewAllocaBeginOffset; assert(TD.getTypeStoreSize(Ty) + RelOffset <= - TD.getTypeStoreSize(IntPromotionTy) && + TD.getTypeStoreSize(IntTy) && "Element store outside of alloca store"); uint64_t ShAmt = 8*RelOffset; if (TD.isBigEndian()) - ShAmt = 8*(TD.getTypeStoreSize(IntPromotionTy) - TD.getTypeStoreSize(Ty) + ShAmt = 8*(TD.getTypeStoreSize(IntTy) - TD.getTypeStoreSize(Ty) - RelOffset); if (ShAmt) V = IRB.CreateShl(V, ShAmt, getName(".shift")); - APInt Mask = ~Ty->getMask().zext(IntPromotionTy->getBitWidth()).shl(ShAmt); - Value *Old = IRB.CreateAnd(IRB.CreateAlignedLoad(&NewAI, - NewAI.getAlignment(), - getName(".oldload")), - Mask, getName(".mask")); - return IRB.CreateAlignedStore(IRB.CreateOr(Old, V, getName(".insert")), - &NewAI, NewAI.getAlignment()); + if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) { + APInt Mask = 
~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt); + Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".oldload")); + Old = convertValue(TD, IRB, Old, IntTy); + Old = IRB.CreateAnd(Old, Mask, getName(".mask")); + V = IRB.CreateOr(Old, V, getName(".insert")); + } + V = convertValue(TD, IRB, V, NewAllocaTy); + return IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); } void deleteIfTriviallyDead(Value *V) { @@ -2428,7 +2465,7 @@ private: if (VecTy) return rewriteVectorizedLoadInst(IRB, LI, OldOp); - if (IntPromotionTy) + if (IntTy && LI.getType()->isIntegerTy()) return rewriteIntegerLoad(IRB, LI); if (BeginOffset == NewAllocaBeginOffset && @@ -2443,6 +2480,8 @@ private: return !LI.isVolatile(); } + assert(!IntTy && "Invalid load found with int-op widening enabled"); + Value *NewPtr = getAdjustedAllocaPtr(IRB, LI.getPointerOperand()->getType()); LI.setOperand(0, NewPtr); @@ -2492,10 +2531,9 @@ private: if (VecTy) return rewriteVectorizedStoreInst(IRB, SI, OldOp); - if (IntPromotionTy) - return rewriteIntegerStore(IRB, SI); - Type *ValueTy = SI.getValueOperand()->getType(); + if (IntTy && ValueTy->isIntegerTy()) + return rewriteIntegerStore(IRB, SI); // Strip all inbounds GEPs and pointer casts to try to dig out any root // alloca that should be re-examined after promoting this alloca. 
@@ -2516,6 +2554,8 @@ private: return !SI.isVolatile(); } + assert(!IntTy && "Invalid store found with int-op widening enabled"); + Value *NewPtr = getAdjustedAllocaPtr(IRB, SI.getPointerOperand()->getType()); SI.setOperand(1, NewPtr); diff --git a/llvm/test/Transforms/SROA/basictest.ll b/llvm/test/Transforms/SROA/basictest.ll index 7d8e5cd..4a87d91 100644 --- a/llvm/test/Transforms/SROA/basictest.ll +++ b/llvm/test/Transforms/SROA/basictest.ll @@ -409,8 +409,11 @@ declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind define i16 @test5() { ; CHECK: @test5 -; CHECK: alloca float -; CHECK: ret i16 % +; CHECK-NOT: alloca float +; CHECK: %[[cast:.*]] = bitcast float 0.0{{.*}} to i32 +; CHECK-NEXT: %[[shr:.*]] = lshr i32 %[[cast]], 16 +; CHECK-NEXT: %[[trunc:.*]] = trunc i32 %[[shr]] to i16 +; CHECK-NEXT: ret i16 %[[trunc]] entry: %a = alloca [4 x i8] @@ -1012,3 +1015,34 @@ entry: ret i32 %valcast2 ; CHECK: ret i32 } + +define void @PR14059.1(double* %d) { +; In PR14059 a peculiar construct was identified as something that is used +; pervasively in ARM's ABI-calling-convention lowering: the passing of a struct +; of doubles via an array of i32 in order to place the data into integer +; registers. This in turn was missed as an optimization by SROA due to the +; partial loads and stores of integers to the double alloca we were trying to +; form and promote. The solution is to widen the integer operations to be +; whole-alloca operations, and perform the appropriate bitcasting on the +; *values* rather than the pointers. When this works, partial reads and writes +; via integers can be promoted away. 
+; CHECK: @PR14059.1 +; CHECK-NOT: alloca +; CHECK: ret void + +entry: + %X.sroa.0.i = alloca double, align 8 + %0 = bitcast double* %X.sroa.0.i to i8* + call void @llvm.lifetime.start(i64 -1, i8* %0) + %X.sroa.0.0.cast2.i = bitcast double* %X.sroa.0.i to i32* + store i32 0, i32* %X.sroa.0.0.cast2.i, align 8 + %X.sroa.0.4.raw_idx4.i = getelementptr inbounds i8* %0, i32 4 + %X.sroa.0.4.cast5.i = bitcast i8* %X.sroa.0.4.raw_idx4.i to i32* + store i32 1072693248, i32* %X.sroa.0.4.cast5.i, align 4 + %X.sroa.0.0.load1.i = load double* %X.sroa.0.i, align 8 + %accum.real.i = load double* %d, align 8 + %add.r.i = fadd double %accum.real.i, %X.sroa.0.0.load1.i + store double %add.r.i, double* %d, align 8 + call void @llvm.lifetime.end(i64 -1, i8* %0) + ret void +} diff --git a/llvm/test/Transforms/SROA/phi-and-select.ll b/llvm/test/Transforms/SROA/phi-and-select.ll index 2b0724c..d95e48f 100644 --- a/llvm/test/Transforms/SROA/phi-and-select.ll +++ b/llvm/test/Transforms/SROA/phi-and-select.ll @@ -256,17 +256,17 @@ entry: ret i32 %loaded } -define i32 @test10(i32 %b, i32* %ptr) { +define float @test10(i32 %b, float* %ptr) { ; Don't try to promote allocas which are not elligible for it even after ; rewriting due to the necessity of inserting bitcasts when speculating a PHI ; node. 
; CHECK: @test10 ; CHECK: %[[alloca:.*]] = alloca -; CHECK: %[[argvalue:.*]] = load i32* %ptr -; CHECK: %[[cast:.*]] = bitcast double* %[[alloca]] to i32* -; CHECK: %[[allocavalue:.*]] = load i32* %[[cast]] -; CHECK: %[[result:.*]] = phi i32 [ %[[allocavalue]], %else ], [ %[[argvalue]], %then ] -; CHECK-NEXT: ret i32 %[[result]] +; CHECK: %[[argvalue:.*]] = load float* %ptr +; CHECK: %[[cast:.*]] = bitcast double* %[[alloca]] to float* +; CHECK: %[[allocavalue:.*]] = load float* %[[cast]] +; CHECK: %[[result:.*]] = phi float [ %[[allocavalue]], %else ], [ %[[argvalue]], %then ] +; CHECK-NEXT: ret float %[[result]] entry: %f = alloca double @@ -278,34 +278,34 @@ then: br label %exit else: - %bitcast = bitcast double* %f to i32* + %bitcast = bitcast double* %f to float* br label %exit exit: - %phi = phi i32* [ %bitcast, %else ], [ %ptr, %then ] - %loaded = load i32* %phi, align 4 - ret i32 %loaded + %phi = phi float* [ %bitcast, %else ], [ %ptr, %then ] + %loaded = load float* %phi, align 4 + ret float %loaded } -define i32 @test11(i32 %b, i32* %ptr) { +define float @test11(i32 %b, float* %ptr) { ; Same as @test10 but for a select rather than a PHI node. 
; CHECK: @test11 ; CHECK: %[[alloca:.*]] = alloca -; CHECK: %[[cast:.*]] = bitcast double* %[[alloca]] to i32* -; CHECK: %[[allocavalue:.*]] = load i32* %[[cast]] -; CHECK: %[[argvalue:.*]] = load i32* %ptr -; CHECK: %[[result:.*]] = select i1 %{{.*}}, i32 %[[allocavalue]], i32 %[[argvalue]] -; CHECK-NEXT: ret i32 %[[result]] +; CHECK: %[[cast:.*]] = bitcast double* %[[alloca]] to float* +; CHECK: %[[allocavalue:.*]] = load float* %[[cast]] +; CHECK: %[[argvalue:.*]] = load float* %ptr +; CHECK: %[[result:.*]] = select i1 %{{.*}}, float %[[allocavalue]], float %[[argvalue]] +; CHECK-NEXT: ret float %[[result]] entry: %f = alloca double store double 0.0, double* %f - store i32 0, i32* %ptr + store float 0.0, float* %ptr %test = icmp ne i32 %b, 0 - %bitcast = bitcast double* %f to i32* - %select = select i1 %test, i32* %bitcast, i32* %ptr - %loaded = load i32* %select, align 4 - ret i32 %loaded + %bitcast = bitcast double* %f to float* + %select = select i1 %test, float* %bitcast, float* %ptr + %loaded = load float* %select, align 4 + ret float %loaded } define i32 @test12(i32 %x, i32* %p) { -- 2.7.4