From 009faed1be6c0daded75d0369b08a44460ec6736 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 11 Sep 2013 05:09:42 +0000 Subject: [PATCH] Teach loop-idiom about address space pointer sizes llvm-svn: 190491 --- llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 33 +++++--- .../IndVarSimplify/lftr-address-space-pointers.ll | 69 ++++++++++++++++ .../Transforms/LoopIdiom/basic-address-space.ll | 91 ++++++++++++++++++++++ .../LoopStrengthReduce/addrec-gep-address-space.ll | 88 +++++++++++++++++++++ .../LoopStrengthReduce/address-space-loop.ll | 56 +++++++++++++ .../LoopStrengthReduce/uglygep-address-space.ll | 56 +++++++++++++ 6 files changed, 381 insertions(+), 12 deletions(-) create mode 100644 llvm/test/Transforms/IndVarSimplify/lftr-address-space-pointers.ll create mode 100644 llvm/test/Transforms/LoopIdiom/basic-address-space.ll create mode 100644 llvm/test/Transforms/LoopStrengthReduce/addrec-gep-address-space.ll create mode 100644 llvm/test/Transforms/LoopStrengthReduce/address-space-loop.ll create mode 100644 llvm/test/Transforms/LoopStrengthReduce/uglygep-address-space.ll diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 20fccea..32af415 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -953,6 +953,8 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, Value *SplatValue = isBytewiseValue(StoredVal); Constant *PatternValue = 0; + unsigned DestAS = DestPtr->getType()->getPointerAddressSpace(); + // If we're allowed to form a memset, and the stored value would be acceptable // for memset, use it. if (SplatValue && TLI->has(LibFunc::memset) && @@ -961,8 +963,10 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, CurLoop->isLoopInvariant(SplatValue)) { // Keep and use SplatValue. 
PatternValue = 0; - } else if (TLI->has(LibFunc::memset_pattern16) && + } else if (DestAS == 0 && + TLI->has(LibFunc::memset_pattern16) && (PatternValue = getMemSetPatternValue(StoredVal, *TD))) { + // Don't create memset_pattern16s with address spaces. // It looks like we can use PatternValue! SplatValue = 0; } else { @@ -978,14 +982,15 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, IRBuilder<> Builder(Preheader->getTerminator()); SCEVExpander Expander(*SE, "loop-idiom"); + Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS); + // Okay, we have a strided store "p[i]" of a splattable value. We can turn // this into a memset in the loop preheader now if we want. However, this // would be unsafe to do if there is anything else in the loop that may read // or write to the aliased location. Check for any overlap by generating the // base pointer and checking the region. - unsigned AddrSpace = cast<PointerType>(DestPtr->getType())->getAddressSpace(); Value *BasePtr = - Expander.expandCodeFor(Ev->getStart(), Builder.getInt8PtrTy(AddrSpace), + Expander.expandCodeFor(Ev->getStart(), DestInt8PtrTy, Preheader->getTerminator()); if (mayLoopAccessLocation(BasePtr, AliasAnalysis::ModRef, @@ -1001,7 +1006,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // The # stored bytes is (BECount+1)*Size. Expand the trip count out to // pointer size if it isn't already.
- Type *IntPtr = TD->getIntPtrType(DestPtr->getContext()); + Type *IntPtr = Builder.getIntPtrTy(TD, DestAS); BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1), @@ -1021,11 +1026,15 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, NumBytes, StoreAlignment); } else { + // Everything is emitted in default address space + Type *Int8PtrTy = DestInt8PtrTy; + Module *M = TheStore->getParent()->getParent()->getParent(); Value *MSP = M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(), - Builder.getInt8PtrTy(), - Builder.getInt8PtrTy(), IntPtr, + Int8PtrTy, + Int8PtrTy, + IntPtr, (void*)0); // Otherwise we should form a memset_pattern16. PatternValue is known to be @@ -1035,7 +1044,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, PatternValue, ".memset_pattern"); GV->setUnnamedAddr(true); // Ok to merge these. GV->setAlignment(16); - Value *PatternPtr = ConstantExpr::getBitCast(GV, Builder.getInt8PtrTy()); + Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy); NewCall = Builder.CreateCall3(MSP, BasePtr, PatternPtr, NumBytes); } @@ -1111,17 +1120,17 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, // The # stored bytes is (BECount+1)*Size. Expand the trip count out to // pointer size if it isn't already. 
- Type *IntPtr = TD->getIntPtrType(SI->getContext()); - BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); + Type *IntPtrTy = Builder.getIntPtrTy(TD, SI->getPointerAddressSpace()); + BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy); - const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1), + const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtrTy, 1), SCEV::FlagNUW); if (StoreSize != 1) - NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize), + NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize), SCEV::FlagNUW); Value *NumBytes = - Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); + Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator()); CallInst *NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, diff --git a/llvm/test/Transforms/IndVarSimplify/lftr-address-space-pointers.ll b/llvm/test/Transforms/IndVarSimplify/lftr-address-space-pointers.ll new file mode 100644 index 0000000..e4c31d1 --- /dev/null +++ b/llvm/test/Transforms/IndVarSimplify/lftr-address-space-pointers.ll @@ -0,0 +1,69 @@ +; RUN: opt -S -indvars -o - %s | FileCheck %s +target datalayout = "e-p:32:32:32-p1:64:64:64-p2:8:8:8-p3:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:32-n8:16:32:64" + +; Derived from ptriv in lftr-reuse.ll +define void @ptriv_as2(i8 addrspace(2)* %base, i32 %n) nounwind { +; CHECK-LABEL: @ptriv_as2( +entry: + %idx.trunc = trunc i32 %n to i8 + %add.ptr = getelementptr inbounds i8 addrspace(2)* %base, i8 %idx.trunc + %cmp1 = icmp ult i8 addrspace(2)* %base, %add.ptr + br i1 %cmp1, label %for.body, label %for.end + +; Make sure the added GEP has the right index type +; CHECK: %lftr.limit = getelementptr i8 addrspace(2)* %base, i8 %0 + +; CHECK: for.body: +; CHECK: phi i8 addrspace(2)* +; CHECK-NOT: phi +; CHECK-NOT: add{{^rspace}} +; CHECK: icmp ne i8 addrspace(2)* +; CHECK: br i1 +for.body: + %p.02 = phi i8 addrspace(2)* [ %base, 
%entry ], [ %incdec.ptr, %for.body ] + ; cruft to make the IV useful + %sub.ptr.lhs.cast = ptrtoint i8 addrspace(2)* %p.02 to i8 + %sub.ptr.rhs.cast = ptrtoint i8 addrspace(2)* %base to i8 + %sub.ptr.sub = sub i8 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast + store i8 %sub.ptr.sub, i8 addrspace(2)* %p.02 + %incdec.ptr = getelementptr inbounds i8 addrspace(2)* %p.02, i32 1 + %cmp = icmp ult i8 addrspace(2)* %incdec.ptr, %add.ptr + br i1 %cmp, label %for.body, label %for.end + +for.end: + ret void +} + +define void @ptriv_as3(i8 addrspace(3)* %base, i32 %n) nounwind { +; CHECK-LABEL: @ptriv_as3( +entry: + %idx.trunc = trunc i32 %n to i16 + %add.ptr = getelementptr inbounds i8 addrspace(3)* %base, i16 %idx.trunc + %cmp1 = icmp ult i8 addrspace(3)* %base, %add.ptr + br i1 %cmp1, label %for.body, label %for.end + +; Make sure the added GEP has the right index type +; CHECK: %lftr.limit = getelementptr i8 addrspace(3)* %base, i16 %0 + +; CHECK: for.body: +; CHECK: phi i8 addrspace(3)* +; CHECK-NOT: phi +; CHECK-NOT: add{{^rspace}} +; CHECK: icmp ne i8 addrspace(3)* +; CHECK: br i1 +for.body: + %p.02 = phi i8 addrspace(3)* [ %base, %entry ], [ %incdec.ptr, %for.body ] + ; cruft to make the IV useful + %sub.ptr.lhs.cast = ptrtoint i8 addrspace(3)* %p.02 to i16 + %sub.ptr.rhs.cast = ptrtoint i8 addrspace(3)* %base to i16 + %sub.ptr.sub = sub i16 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast + %conv = trunc i16 %sub.ptr.sub to i8 + store i8 %conv, i8 addrspace(3)* %p.02 + %incdec.ptr = getelementptr inbounds i8 addrspace(3)* %p.02, i32 1 + %cmp = icmp ult i8 addrspace(3)* %incdec.ptr, %add.ptr + br i1 %cmp, label %for.body, label %for.end + +for.end: + ret void +} + diff --git a/llvm/test/Transforms/LoopIdiom/basic-address-space.ll b/llvm/test/Transforms/LoopIdiom/basic-address-space.ll new file mode 100644 index 0000000..697ab372 --- /dev/null +++ b/llvm/test/Transforms/LoopIdiom/basic-address-space.ll @@ -0,0 +1,91 @@ +; RUN: opt -basicaa -loop-idiom < %s -S | FileCheck %s + +target 
datalayout = "e-p:32:32:32-p1:64:64:64-p2:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:32-n8:16:32:64" +target triple = "x86_64-apple-darwin10.0.0" + +; Two dimensional nested loop should be promoted to one big memset. +define void @test10(i8 addrspace(2)* %X) nounwind ssp { +; CHECK-LABEL: @test10( +; CHECK: entry: +; CHECK-NEXT: call void @llvm.memset.p2i8.i16(i8 addrspace(2)* %X, i8 0, i16 10000, i32 1, i1 false) +; CHECK-NOT: store +; CHECK: ret void + +entry: + br label %bb.nph + +bb.nph: ; preds = %entry, %for.inc10 + %i.04 = phi i16 [ 0, %entry ], [ %inc12, %for.inc10 ] + br label %for.body5 + +for.body5: ; preds = %for.body5, %bb.nph + %j.02 = phi i16 [ 0, %bb.nph ], [ %inc, %for.body5 ] + %mul = mul nsw i16 %i.04, 100 + %add = add nsw i16 %j.02, %mul + %arrayidx = getelementptr inbounds i8 addrspace(2)* %X, i16 %add + store i8 0, i8 addrspace(2)* %arrayidx, align 1 + %inc = add nsw i16 %j.02, 1 + %cmp4 = icmp eq i16 %inc, 100 + br i1 %cmp4, label %for.inc10, label %for.body5 + +for.inc10: ; preds = %for.body5 + %inc12 = add nsw i16 %i.04, 1 + %cmp = icmp eq i16 %inc12, 100 + br i1 %cmp, label %for.end13, label %bb.nph + +for.end13: ; preds = %for.inc10 + ret void +} + +define void @test11_pattern(i32 addrspace(2)* nocapture %P) nounwind ssp { +; CHECK-LABEL: @test11_pattern( +; CHECK-NOT: memset_pattern +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %for.body ] + %arrayidx = getelementptr i32 addrspace(2)* %P, i64 %indvar + store i32 1, i32 addrspace(2)* %arrayidx, align 4 + %indvar.next = add i64 %indvar, 1 + %exitcond = icmp eq i64 %indvar.next, 10000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; PR9815 - This is a partial overlap case that cannot be safely transformed +; into a memcpy. 
+@g_50 = addrspace(2) global [7 x i32] [i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0], align 16 + + +define i32 @test14() nounwind { +; CHECK-LABEL: @test14( +; CHECK: for.body: +; CHECK: load i32 +; CHECK: store i32 +; CHECK: br i1 %cmp + +entry: + br label %for.body + +for.body: ; preds = %for.inc, %for.body.lr.ph + %tmp5 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %add = add nsw i32 %tmp5, 4 + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds [7 x i32] addrspace(2)* @g_50, i32 0, i64 %idxprom + %tmp2 = load i32 addrspace(2)* %arrayidx, align 4 + %add4 = add nsw i32 %tmp5, 5 + %idxprom5 = sext i32 %add4 to i64 + %arrayidx6 = getelementptr inbounds [7 x i32] addrspace(2)* @g_50, i32 0, i64 %idxprom5 + store i32 %tmp2, i32 addrspace(2)* %arrayidx6, align 4 + %inc = add nsw i32 %tmp5, 1 + %cmp = icmp slt i32 %inc, 2 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.inc + %tmp8 = load i32 addrspace(2)* getelementptr inbounds ([7 x i32] addrspace(2)* @g_50, i32 0, i64 6), align 4 + ret i32 %tmp8 +} + diff --git a/llvm/test/Transforms/LoopStrengthReduce/addrec-gep-address-space.ll b/llvm/test/Transforms/LoopStrengthReduce/addrec-gep-address-space.ll new file mode 100644 index 0000000..6333291 --- /dev/null +++ b/llvm/test/Transforms/LoopStrengthReduce/addrec-gep-address-space.ll @@ -0,0 +1,88 @@ +; RUN: opt < %s -loop-reduce -S | FileCheck %s +; CHECK: bb1: +; CHECK: load double addrspace(1)* [[IV:%[^,]+]] +; CHECK: store double {{.*}}, double addrspace(1)* [[IV]] + +; CHECK-NOT: cast +; Make sure the GEP has the right index type +; CHECK: getelementptr double addrspace(1)* [[IV]], i16 1 +; CHECK: br {{.*}} label %bb1 + +; Make sure the GEP has the right index type +; CHECK: getelementptr double addrspace(1)* {{.*}}, i16 + + +; This test tests several things. 
The load and store should use the +; same address instead of having it computed twice, and SCEVExpander should +; be able to reconstruct the full getelementptr, despite it having a few +; obstacles set in its way. +; We only check that the inner loop (bb1-bb2) is "reduced" because LSR +; currently only operates on inner loops. + +target datalayout = "e-p:64:64:64-p1:16:16:16-n16:32:64" + +define void @foo(i64 %n, i64 %m, i64 %o, i64 %q, double addrspace(1)* nocapture %p) nounwind { +entry: + %tmp = icmp sgt i64 %n, 0 ; [#uses=1] + br i1 %tmp, label %bb.nph3, label %return + +bb.nph: ; preds = %bb2.preheader + %tmp1 = mul i64 %tmp16, %i.02 ; [#uses=1] + %tmp2 = mul i64 %tmp19, %i.02 ; [#uses=1] + br label %bb1 + +bb1: ; preds = %bb2, %bb.nph + %j.01 = phi i64 [ %tmp9, %bb2 ], [ 0, %bb.nph ] ; [#uses=3] + %tmp3 = add i64 %j.01, %tmp1 ; [#uses=1] + %tmp4 = add i64 %j.01, %tmp2 ; [#uses=1] + %z0 = add i64 %tmp3, 5203 + %tmp5 = getelementptr double addrspace(1)* %p, i64 %z0 ; [#uses=1] + %tmp6 = load double addrspace(1)* %tmp5, align 8 ; [#uses=1] + %tmp7 = fdiv double %tmp6, 2.100000e+00 ; [#uses=1] + %z1 = add i64 %tmp4, 5203 + %tmp8 = getelementptr double addrspace(1)* %p, i64 %z1 ; [#uses=1] + store double %tmp7, double addrspace(1)* %tmp8, align 8 + %tmp9 = add i64 %j.01, 1 ; [#uses=2] + br label %bb2 + +bb2: ; preds = %bb1 + %tmp10 = icmp slt i64 %tmp9, %m ; [#uses=1] + br i1 %tmp10, label %bb1, label %bb2.bb3_crit_edge + +bb2.bb3_crit_edge: ; preds = %bb2 + br label %bb3 + +bb3: ; preds = %bb2.preheader, %bb2.bb3_crit_edge + %tmp11 = add i64 %i.02, 1 ; [#uses=2] + br label %bb4 + +bb4: ; preds = %bb3 + %tmp12 = icmp slt i64 %tmp11, %n ; [#uses=1] + br i1 %tmp12, label %bb2.preheader, label %bb4.return_crit_edge + +bb4.return_crit_edge: ; preds = %bb4 + br label %bb4.return_crit_edge.split + +bb4.return_crit_edge.split: ; preds = %bb.nph3, %bb4.return_crit_edge + br label %return + +bb.nph3: ; preds = %entry + %tmp13 = icmp sgt i64 %m, 0 ; [#uses=1] + %tmp14 = mul 
i64 %n, 37 ; [#uses=1] + %tmp15 = mul i64 %tmp14, %o ; [#uses=1] + %tmp16 = mul i64 %tmp15, %q ; [#uses=1] + %tmp17 = mul i64 %n, 37 ; [#uses=1] + %tmp18 = mul i64 %tmp17, %o ; [#uses=1] + %tmp19 = mul i64 %tmp18, %q ; [#uses=1] + br i1 %tmp13, label %bb.nph3.split, label %bb4.return_crit_edge.split + +bb.nph3.split: ; preds = %bb.nph3 + br label %bb2.preheader + +bb2.preheader: ; preds = %bb.nph3.split, %bb4 + %i.02 = phi i64 [ %tmp11, %bb4 ], [ 0, %bb.nph3.split ] ; [#uses=3] + br i1 true, label %bb.nph, label %bb3 + +return: ; preds = %bb4.return_crit_edge.split, %entry + ret void +} diff --git a/llvm/test/Transforms/LoopStrengthReduce/address-space-loop.ll b/llvm/test/Transforms/LoopStrengthReduce/address-space-loop.ll new file mode 100644 index 0000000..9c1b213 --- /dev/null +++ b/llvm/test/Transforms/LoopStrengthReduce/address-space-loop.ll @@ -0,0 +1,56 @@ +; RUN: opt -S -loop-reduce < %s | FileCheck %s + +; LSR shouldn't consider %t8 to be an interesting user of %t6, and it +; should be able to form pretty GEPs. + +target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" + +; Copy of uglygep with a different address space +; This tests that expandAddToGEP uses the right smaller integer type for +; another address space +define void @Z4() nounwind { +; CHECK-LABEL: @Z4( +bb: + br label %bb3 + +bb1: ; preds = %bb3 + br i1 undef, label %bb10, label %bb2 + +bb2: ; preds = %bb1 + %t = add i16 %t4, 1 ; [#uses=1] + br label %bb3 + +bb3: ; preds = %bb2, %bb + %t4 = phi i16 [ %t, %bb2 ], [ 0, %bb ] ; [#uses=3] + br label %bb1 + +; CHECK: bb10: +; CHECK-NEXT: %t7 = icmp eq i16 %t4, 0 +; Hoist %t2 computation outside the loop.
+; CHECK-NEXT: [[SCEVGEP:%[^ ]+]] = getelementptr i8 addrspace(1)* undef, i16 %t4 +; CHECK-NEXT: br label %bb14 +bb10: ; preds = %bb9 + %t7 = icmp eq i16 %t4, 0 ; [#uses=1] + %t3 = add i16 %t4, 16 ; [#uses=1] + br label %bb14 + +; CHECK: bb14: +; CHECK-NEXT: store i8 undef, i8 addrspace(1)* [[SCEVGEP]] +; CHECK-NEXT: %t6 = load float addrspace(1)* addrspace(1)* undef +; Fold %t3's add within the address. +; CHECK-NEXT: [[SCEVGEP1:%[^ ]+]] = getelementptr float addrspace(1)* %t6, i16 4 +; CHECK-NEXT: [[SCEVGEP2:%[^ ]+]] = bitcast float addrspace(1)* [[SCEVGEP1]] to i8 addrspace(1)* +; Use the induction variable (%t4) to access the right element +; CHECK-NEXT: [[ADDRESS:%[^ ]+]] = getelementptr i8 addrspace(1)* [[SCEVGEP2]], i16 %t4 +; CHECK-NEXT: store i8 undef, i8 addrspace(1)* [[ADDRESS]] +; CHECK-NEXT: br label %bb14 +bb14: ; preds = %bb14, %bb10 + %t2 = getelementptr inbounds i8 addrspace(1)* undef, i16 %t4 ; [#uses=1] + store i8 undef, i8 addrspace(1)* %t2 + %t6 = load float addrspace(1)* addrspace(1)* undef + %t8 = bitcast float addrspace(1)* %t6 to i8 addrspace(1)* ; [#uses=1] + %t9 = getelementptr inbounds i8 addrspace(1)* %t8, i16 %t3 ; [#uses=1] + store i8 undef, i8 addrspace(1)* %t9 + br label %bb14 +} + diff --git a/llvm/test/Transforms/LoopStrengthReduce/uglygep-address-space.ll b/llvm/test/Transforms/LoopStrengthReduce/uglygep-address-space.ll new file mode 100644 index 0000000..2c65261 --- /dev/null +++ b/llvm/test/Transforms/LoopStrengthReduce/uglygep-address-space.ll @@ -0,0 +1,56 @@ +; RUN: opt < %s -loop-reduce -S | FileCheck %s + +; LSR shouldn't consider %t8 to be an interesting user of %t6, and it +; should be able to form pretty GEPs. 
+ +target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" + +; Copy of uglygep with a different address space +; This tests that expandAddToGEP uses the right smaller integer type for +; another address space +define void @Z4() nounwind { +; CHECK: define void @Z4 +bb: + br label %bb3 + +bb1: ; preds = %bb3 + br i1 undef, label %bb10, label %bb2 + +bb2: ; preds = %bb1 + %t = add i16 %t4, 1 ; [#uses=1] + br label %bb3 + +bb3: ; preds = %bb2, %bb + %t4 = phi i16 [ %t, %bb2 ], [ 0, %bb ] ; [#uses=3] + br label %bb1 + +; CHECK: bb10: +; CHECK-NEXT: %t7 = icmp eq i16 %t4, 0 +; Hoist %t2 computation outside the loop. +; CHECK-NEXT: [[SCEVGEP:%[^ ]+]] = getelementptr i8 addrspace(1)* undef, i16 %t4 +; CHECK-NEXT: br label %bb14 +bb10: ; preds = %bb9 + %t7 = icmp eq i16 %t4, 0 ; [#uses=1] + %t3 = add i16 %t4, 16 ; [#uses=1] + br label %bb14 + +; CHECK: bb14: +; CHECK-NEXT: store i8 undef, i8 addrspace(1)* [[SCEVGEP]] +; CHECK-NEXT: %t6 = load float addrspace(1)* addrspace(1)* undef +; Fold %t3's add within the address. +; CHECK-NEXT: [[SCEVGEP1:%[^ ]+]] = getelementptr float addrspace(1)* %t6, i16 4 +; CHECK-NEXT: [[SCEVGEP2:%[^ ]+]] = bitcast float addrspace(1)* [[SCEVGEP1]] to i8 addrspace(1)* +; Use the induction variable (%t4) to access the right element +; CHECK-NEXT: [[ADDRESS:%[^ ]+]] = getelementptr i8 addrspace(1)* [[SCEVGEP2]], i16 %t4 +; CHECK-NEXT: store i8 undef, i8 addrspace(1)* [[ADDRESS]] +; CHECK-NEXT: br label %bb14 +bb14: ; preds = %bb14, %bb10 + %t2 = getelementptr inbounds i8 addrspace(1)* undef, i16 %t4 ; [#uses=1] + store i8 undef, i8 addrspace(1)* %t2 + %t6 = load float addrspace(1)* addrspace(1)* undef + %t8 = bitcast float addrspace(1)* %t6 to i8 addrspace(1)* ; [#uses=1] + %t9 = getelementptr inbounds i8 addrspace(1)* %t8, i16 %t3 ; [#uses=1] + store i8 undef, i8 addrspace(1)* %t9 + br label %bb14 +} + -- 2.7.4