From e735f2bf379285294c41748eec3e95e3e8bd1acd Mon Sep 17 00:00:00 2001
From: Philip Reames
Date: Wed, 1 Sep 2021 13:07:32 -0700
Subject: [PATCH] [SCEVExpander] Prefer pointer expansion for overflow checks

We'd special cased this logic to use pointer types for non-integral
pointers, but there's no reason we can't do that for all pointer types.
Doing it this way has a few advantages:

a) The code itself becomes more straightforward, and easier to test.
b) We avoid introducing ptrtoint into programs which didn't have one in
   the source.
c) The resulting codegen is easier to analyze and simplify (mostly due
   to the lack of ptrtoint).

Note that there are some test diffs, but a) running them through
instcombine helps a ton, and b) there are enough missing obvious
transforms on both the before and after IR that it's clear this isn't
performance sensitive.

This is mostly motivated by cleaning up mentions of non-integrals to
have a clearer idea of what we actually need to support.
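
For illustration, a hand-written sketch of the shape of the emitted
check before and after this change (value names are invented for the
example, not copied from the tests below), for a pointer %a of type
i32* and a byte offset %mul:

Before (integer expansion):
  %a.int = ptrtoint i32* %a to i64
  %end.int = add i64 %a.int, %mul
  %cmp = icmp ult i64 %end.int, %a.int

After (pointer expansion):
  %a.i8 = bitcast i32* %a to i8*
  %end.i8 = getelementptr i8, i8* %a.i8, i64 %mul
  %end = bitcast i8* %end.i8 to i32*
  %cmp = icmp ult i32* %end, %a

The comparison stays in pointer space, so no ptrtoint is introduced.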

Differential Revision: https://reviews.llvm.org/D104662
---
 .../Transforms/Utils/ScalarEvolutionExpander.cpp   |   8 +-
 .../LoopDistribute/scev-inserted-runtime-check.ll  |  56 ++++----
 .../LoopVersioning/wrapping-pointer-versioning.ll  | 158 ++++++++++++---------
 3 files changed, 122 insertions(+), 100 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index 38978c3..ba4876b 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -2461,15 +2461,11 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR,
   IntegerType *Ty =
       IntegerType::get(Loc->getContext(), SE.getTypeSizeInBits(ARTy));
 
-  Type *ARExpandTy = DL.isNonIntegralPointerType(ARTy) ? ARTy : Ty;
 
   Value *StepValue = expandCodeForImpl(Step, Ty, Loc, false);
   Value *NegStepValue =
       expandCodeForImpl(SE.getNegativeSCEV(Step), Ty, Loc, false);
-  Value *StartValue = expandCodeForImpl(
-      isa<PointerType>(ARExpandTy) ? Start
-                                   : SE.getPtrToIntExpr(Start, ARExpandTy),
-      ARExpandTy, Loc, false);
+  Value *StartValue = expandCodeForImpl(Start, ARTy, Loc, false);
 
   ConstantInt *Zero =
       ConstantInt::get(Loc->getContext(), APInt::getNullValue(DstBits));
@@ -2493,7 +2489,7 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR,
   // Start + |Step| * Backedge < Start
   // Start - |Step| * Backedge > Start
   Value *Add = nullptr, *Sub = nullptr;
-  if (PointerType *ARPtrTy = dyn_cast<PointerType>(ARExpandTy)) {
+  if (PointerType *ARPtrTy = dyn_cast<PointerType>(ARTy)) {
     const SCEV *MulS = SE.getSCEV(MulV);
     const SCEV *NegMulS = SE.getNegativeSCEV(MulS);
     Add = Builder.CreateBitCast(expandAddToGEP(MulS, ARPtrTy, Ty, StartValue),
diff --git a/llvm/test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll b/llvm/test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll
index b407ed1..da2aa44 100644
--- a/llvm/test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll
+++ b/llvm/test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll
@@ -9,7 +9,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 define void @f(i32* noalias %a, i32* noalias %b, i32* noalias %c, i32* noalias %d, i32* noalias %e, i64 %N) {
 ; CHECK-LABEL: @f(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A2:%.*]] = ptrtoint i32* [[A:%.*]] to i64
+; CHECK-NEXT: [[A5:%.*]] = bitcast i32* [[A:%.*]] to i8*
 ; CHECK-NEXT: br label [[FOR_BODY_LVER_CHECK:%.*]]
 ; CHECK: for.body.lver.check:
 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], -1
@@ -26,17 +26,21 @@ define void @f(i32* noalias %a, i32* noalias %
 ; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
 ; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP8]], [[MUL_OVERFLOW]]
 ; CHECK-NEXT: [[TMP10:%.*]] = or i1 false, [[TMP9]]
-; CHECK-NEXT: [[MUL3:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 8, i64 [[TMP0]])
-; CHECK-NEXT: [[MUL_RESULT4:%.*]] = extractvalue { i64, i1 } [[MUL3]], 0
-; CHECK-NEXT: [[MUL_OVERFLOW5:%.*]] = extractvalue { i64, i1 } [[MUL3]], 1
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[A2]], [[MUL_RESULT4]]
-; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[A2]], [[MUL_RESULT4]]
-; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[TMP12]], [[A2]]
-; CHECK-NEXT: [[TMP14:%.*]] = icmp ult i64 [[TMP11]], [[A2]]
-; CHECK-NEXT: [[TMP15:%.*]] = select i1 false, i1 [[TMP13]], i1 [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or i1 [[TMP15]], [[MUL_OVERFLOW5]]
-; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[TMP10]], [[TMP16]]
-; CHECK-NEXT: br i1 [[TMP17]], label [[FOR_BODY_PH_LVER_ORIG:%.*]], label [[FOR_BODY_PH_LDIST1:%.*]]
+; CHECK-NEXT: [[MUL2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 8, i64 [[TMP0]])
+; CHECK-NEXT: [[MUL_RESULT3:%.*]] = extractvalue { i64, i1 } [[MUL2]], 0
+; CHECK-NEXT: [[MUL_OVERFLOW4:%.*]] = extractvalue { i64, i1 } [[MUL2]], 1
+; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A5]], i64 [[MUL_RESULT3]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8* [[UGLYGEP]] to i32*
+; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[MUL_RESULT3]], -8
+; CHECK-NEXT: [[TMP13:%.*]] = sub i64 8, [[TMP12]]
+; CHECK-NEXT: [[UGLYGEP6:%.*]] = getelementptr i8, i8* [[A5]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[UGLYGEP6]] to i32*
+; CHECK-NEXT: [[TMP15:%.*]] = icmp ugt i32* [[TMP14]], [[A]]
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ult i32* [[TMP11]], [[A]]
+; CHECK-NEXT: [[TMP17:%.*]] = select i1 false, i1 [[TMP15]], i1 [[TMP16]]
+; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP17]], [[MUL_OVERFLOW4]]
+; CHECK-NEXT: [[TMP19:%.*]] = or i1 [[TMP10]], [[TMP18]]
+; CHECK-NEXT: br i1 [[TMP19]], label [[FOR_BODY_PH_LVER_ORIG:%.*]], label [[FOR_BODY_PH_LDIST1:%.*]]
 ; CHECK: for.body.ph.lver.orig:
 ; CHECK-NEXT: br label [[FOR_BODY_LVER_ORIG:%.*]]
 ; CHECK: for.body.lver.orig:
@@ -97,10 +101,10 @@ define void @f(i32* noalias %a, i32* noalias %
 ; CHECK-NEXT: [[ARRAYIDXC:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[MUL_EXT]]
 ; CHECK-NEXT: store i32 [[MULC]], i32* [[ARRAYIDXC]], align 4
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[ADD]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT6:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT7:%.*]], label [[FOR_BODY]]
 ; CHECK: for.end.loopexit:
 ; CHECK-NEXT: br label [[FOR_END:%.*]]
-; CHECK: for.end.loopexit6:
+; CHECK: for.end.loopexit7:
 ; CHECK-NEXT: br label [[FOR_END]]
 ; CHECK: for.end:
 ; CHECK-NEXT: ret void
@@ -177,14 +181,18 @@ define void @f_with_offset(i32* noalias %b, i32* noalias %c, i32* noalias %d, i3
 ; CHECK-NEXT: [[MUL2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 8, i64 [[TMP0]])
 ; CHECK-NEXT: [[MUL_RESULT3:%.*]] = extractvalue { i64, i1 } [[MUL2]], 0
 ; CHECK-NEXT: [[MUL_OVERFLOW4:%.*]] = extractvalue { i64, i1 } [[MUL2]], 1
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 add (i64 ptrtoint ([8192 x i32]* @global_a to i64), i64 168), [[MUL_RESULT3]]
-; CHECK-NEXT: [[TMP12:%.*]] = sub i64 add (i64 ptrtoint ([8192 x i32]* @global_a to i64), i64 168), [[MUL_RESULT3]]
-; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[TMP12]], add (i64 ptrtoint ([8192 x i32]* @global_a to i64), i64 168)
-; CHECK-NEXT: [[TMP14:%.*]] = icmp ult i64 [[TMP11]], add (i64 ptrtoint ([8192 x i32]* @global_a to i64), i64 168)
-; CHECK-NEXT: [[TMP15:%.*]] = select i1 false, i1 [[TMP13]], i1 [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or i1 [[TMP15]], [[MUL_OVERFLOW4]]
-; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[TMP10]], [[TMP16]]
-; CHECK-NEXT: br i1 [[TMP17]], label [[FOR_BODY_PH_LVER_ORIG:%.*]], label [[FOR_BODY_PH_LDIST1:%.*]]
+; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* bitcast (i32* getelementptr inbounds ([8192 x i32], [8192 x i32]* @global_a, i64 0, i64 42) to i8*), i64 [[MUL_RESULT3]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8* [[UGLYGEP]] to [8192 x i32]*
+; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[MUL_RESULT3]], -8
+; CHECK-NEXT: [[TMP13:%.*]] = sub i64 8, [[TMP12]]
+; CHECK-NEXT: [[UGLYGEP5:%.*]] = getelementptr i8, i8* bitcast (i32* getelementptr inbounds ([8192 x i32], [8192 x i32]* @global_a, i64 0, i64 42) to i8*), i64 [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[UGLYGEP5]] to [8192 x i32]*
+; CHECK-NEXT: [[TMP15:%.*]] = icmp ugt [8192 x i32]* [[TMP14]], bitcast (i32* getelementptr inbounds ([8192 x i32], [8192 x i32]* @global_a, i64 0, i64 42) to [8192 x i32]*)
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ult [8192 x i32]* [[TMP11]], bitcast (i32* getelementptr inbounds ([8192 x i32], [8192 x i32]* @global_a, i64 0, i64 42) to [8192 x i32]*)
+; CHECK-NEXT: [[TMP17:%.*]] = select i1 false, i1 [[TMP15]], i1 [[TMP16]]
+; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP17]], [[MUL_OVERFLOW4]]
+; CHECK-NEXT: [[TMP19:%.*]] = or i1 [[TMP10]], [[TMP18]]
+; CHECK-NEXT: br i1 [[TMP19]], label [[FOR_BODY_PH_LVER_ORIG:%.*]], label [[FOR_BODY_PH_LDIST1:%.*]]
 ; CHECK: for.body.ph.lver.orig:
 ; CHECK-NEXT: br label [[FOR_BODY_LVER_ORIG:%.*]]
 ; CHECK: for.body.lver.orig:
@@ -245,10 +253,10 @@ define void @f_with_offset(i32* noalias %b, i32* noalias %c, i32* noalias %d, i3
 ; CHECK-NEXT: [[ARRAYIDXC:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[MUL_EXT]]
 ; CHECK-NEXT: store i32 [[MULC]], i32* [[ARRAYIDXC]], align 4
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[ADD]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT5:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT6:%.*]], label [[FOR_BODY]]
 ; CHECK: for.end.loopexit:
 ; CHECK-NEXT: br label [[FOR_END:%.*]]
-; CHECK: for.end.loopexit5:
+; CHECK: for.end.loopexit6:
 ; CHECK-NEXT: br label [[FOR_END]]
 ; CHECK: for.end:
 ; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/LoopVersioning/wrapping-pointer-versioning.ll b/llvm/test/Transforms/LoopVersioning/wrapping-pointer-versioning.ll
index 0752cde..2a4039d 100644
--- a/llvm/test/Transforms/LoopVersioning/wrapping-pointer-versioning.ll
+++ b/llvm/test/Transforms/LoopVersioning/wrapping-pointer-versioning.ll
@@ -28,7 +28,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 define void @f1(i16* noalias %a,
 ; LV-LABEL: @f1(
 ; LV-NEXT: for.body.lver.check:
-; LV-NEXT: [[A2:%.*]] = ptrtoint i16* [[A:%.*]] to i64
+; LV-NEXT: [[A5:%.*]] = bitcast i16* [[A:%.*]] to i8*
 ; LV-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], -1
 ; LV-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
 ; LV-NEXT: [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 2, i32 [[TMP1]])
@@ -43,17 +43,21 @@ define void @f1(i16* noalias %a,
 ; LV-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
 ; LV-NEXT: [[TMP9:%.*]] = or i1 [[TMP8]], [[MUL_OVERFLOW]]
 ; LV-NEXT: [[TMP10:%.*]] = or i1 false, [[TMP9]]
-; LV-NEXT: [[MUL3:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 4, i64 [[TMP0]])
-; LV-NEXT: [[MUL_RESULT4:%.*]] = extractvalue { i64, i1 } [[MUL3]], 0
-; LV-NEXT: [[MUL_OVERFLOW5:%.*]] = extractvalue { i64, i1 } [[MUL3]], 1
-; LV-NEXT: [[TMP11:%.*]] = add i64 [[A2]], [[MUL_RESULT4]]
-; LV-NEXT: [[TMP12:%.*]] = sub i64 [[A2]], [[MUL_RESULT4]]
-; LV-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[TMP12]], [[A2]]
-; LV-NEXT: [[TMP14:%.*]] = icmp ult i64 [[TMP11]], [[A2]]
-; LV-NEXT: [[TMP15:%.*]] = select i1 false, i1 [[TMP13]], i1 [[TMP14]]
-; LV-NEXT: [[TMP16:%.*]] = or i1 [[TMP15]], [[MUL_OVERFLOW5]]
-; LV-NEXT: [[TMP17:%.*]] = or i1 [[TMP10]], [[TMP16]]
-; LV-NEXT: br i1 [[TMP17]], label [[FOR_BODY_PH_LVER_ORIG:%.*]], label [[FOR_BODY_PH:%.*]]
+; LV-NEXT: [[MUL2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 4, i64 [[TMP0]])
+; LV-NEXT: [[MUL_RESULT3:%.*]] = extractvalue { i64, i1 } [[MUL2]], 0
+; LV-NEXT: [[MUL_OVERFLOW4:%.*]] = extractvalue { i64, i1 } [[MUL2]], 1
+; LV-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A5]], i64 [[MUL_RESULT3]]
+; LV-NEXT: [[TMP11:%.*]] = bitcast i8* [[UGLYGEP]] to i16*
+; LV-NEXT: [[TMP12:%.*]] = sub i64 [[MUL_RESULT3]], -4
+; LV-NEXT: [[TMP13:%.*]] = sub i64 4, [[TMP12]]
+; LV-NEXT: [[UGLYGEP6:%.*]] = getelementptr i8, i8* [[A5]], i64 [[TMP13]]
+; LV-NEXT: [[TMP14:%.*]] = bitcast i8* [[UGLYGEP6]] to i16*
+; LV-NEXT: [[TMP15:%.*]] = icmp ugt i16* [[TMP14]], [[A]]
+; LV-NEXT: [[TMP16:%.*]] = icmp ult i16* [[TMP11]], [[A]]
+; LV-NEXT: [[TMP17:%.*]] = select i1 false, i1 [[TMP15]], i1 [[TMP16]]
+; LV-NEXT: [[TMP18:%.*]] = or i1 [[TMP17]], [[MUL_OVERFLOW4]]
+; LV-NEXT: [[TMP19:%.*]] = or i1 [[TMP10]], [[TMP18]]
+; LV-NEXT: br i1 [[TMP19]], label [[FOR_BODY_PH_LVER_ORIG:%.*]], label [[FOR_BODY_PH:%.*]]
 ; LV: for.body.ph.lver.orig:
 ; LV-NEXT: br label [[FOR_BODY_LVER_ORIG:%.*]]
 ; LV: for.body.lver.orig:
@@ -87,10 +91,10 @@ define void @f1(i16* noalias %a,
 ; LV-NEXT: [[INC]] = add nuw nsw i64 [[IND]], 1
 ; LV-NEXT: [[INC1]] = add i32 [[IND1]], 1
 ; LV-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[N]]
-; LV-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT6:%.*]], label [[FOR_BODY]]
+; LV-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT7:%.*]], label [[FOR_BODY]]
 ; LV: for.end.loopexit:
 ; LV-NEXT: br label [[FOR_END:%.*]]
-; LV: for.end.loopexit6:
+; LV: for.end.loopexit7:
 ; LV-NEXT: br label [[FOR_END]]
 ; LV: for.end:
 ; LV-NEXT: ret void
@@ -153,7 +157,6 @@
 define void @f2(i16* noalias %a,
 ; LV-LABEL: @f2(
 ; LV-NEXT: for.body.lver.check:
-; LV-NEXT: [[A2:%.*]] = ptrtoint i16* [[A:%.*]] to i64
 ; LV-NEXT: [[TRUNCN:%.*]] = trunc i64 [[N:%.*]] to i32
 ; LV-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
 ; LV-NEXT: [[TMP1:%.*]] = shl i32 [[TRUNCN]], 1
@@ -172,19 +175,24 @@ define void @f2(i16* noalias %a,
 ; LV-NEXT: [[TMP11:%.*]] = or i1 false, [[TMP10]]
 ; LV-NEXT: [[TMP12:%.*]] = trunc i64 [[N]] to i31
 ; LV-NEXT: [[TMP13:%.*]] = zext i31 [[TMP12]] to i64
-; LV-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[TMP13]], 2
-; LV-NEXT: [[TMP15:%.*]] = add i64 [[A2]], [[TMP14]]
-; LV-NEXT: [[MUL3:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 4, i64 [[TMP0]])
-; LV-NEXT: [[MUL_RESULT4:%.*]] = extractvalue { i64, i1 } [[MUL3]], 0
-; LV-NEXT: [[MUL_OVERFLOW5:%.*]] = extractvalue { i64, i1 } [[MUL3]], 1
-; LV-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], [[MUL_RESULT4]]
-; LV-NEXT: [[TMP17:%.*]] = sub i64 [[TMP15]], [[MUL_RESULT4]]
-; LV-NEXT: [[TMP18:%.*]] = icmp ugt i64 [[TMP17]], [[TMP15]]
-; LV-NEXT: [[TMP19:%.*]] = icmp ult i64 [[TMP16]], [[TMP15]]
-; LV-NEXT: [[TMP20:%.*]] = select i1 true, i1 [[TMP18]], i1 [[TMP19]]
-; LV-NEXT: [[TMP21:%.*]] = or i1 [[TMP20]], [[MUL_OVERFLOW5]]
-; LV-NEXT: [[TMP22:%.*]] = or i1 [[TMP11]], [[TMP21]]
-; LV-NEXT: br i1 [[TMP22]], label [[FOR_BODY_PH_LVER_ORIG:%.*]], label [[FOR_BODY_PH:%.*]]
+; LV-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[TMP13]], 1
+; LV-NEXT: [[SCEVGEP:%.*]] = getelementptr i16, i16* [[A:%.*]], i64 [[TMP14]]
+; LV-NEXT: [[MUL2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 4, i64 [[TMP0]])
+; LV-NEXT: [[MUL_RESULT3:%.*]] = extractvalue { i64, i1 } [[MUL2]], 0
+; LV-NEXT: [[MUL_OVERFLOW4:%.*]] = extractvalue { i64, i1 } [[MUL2]], 1
+; LV-NEXT: [[SCEVGEP5:%.*]] = bitcast i16* [[SCEVGEP]] to i8*
+; LV-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[SCEVGEP5]], i64 [[MUL_RESULT3]]
+; LV-NEXT: [[TMP15:%.*]] = bitcast i8* [[UGLYGEP]] to i16*
+; LV-NEXT: [[TMP16:%.*]] = sub i64 [[MUL_RESULT3]], -4
+; LV-NEXT: [[TMP17:%.*]] = sub i64 4, [[TMP16]]
+; LV-NEXT: [[UGLYGEP6:%.*]] = getelementptr i8, i8* [[SCEVGEP5]], i64 [[TMP17]]
+; LV-NEXT: [[TMP18:%.*]] = bitcast i8* [[UGLYGEP6]] to i16*
+; LV-NEXT: [[TMP19:%.*]] = icmp ugt i16* [[TMP18]], [[SCEVGEP]]
+; LV-NEXT: [[TMP20:%.*]] = icmp ult i16* [[TMP15]], [[SCEVGEP]]
+; LV-NEXT: [[TMP21:%.*]] = select i1 true, i1 [[TMP19]], i1 [[TMP20]]
+; LV-NEXT: [[TMP22:%.*]] = or i1 [[TMP21]], [[MUL_OVERFLOW4]]
+; LV-NEXT: [[TMP23:%.*]] = or i1 [[TMP11]], [[TMP22]]
+; LV-NEXT: br i1 [[TMP23]], label [[FOR_BODY_PH_LVER_ORIG:%.*]], label [[FOR_BODY_PH:%.*]]
 ; LV: for.body.ph.lver.orig:
 ; LV-NEXT: br label [[FOR_BODY_LVER_ORIG:%.*]]
 ; LV: for.body.lver.orig:
@@ -218,10 +226,10 @@ define void @f2(i16* noalias %a,
 ; LV-NEXT: [[INC]] = add nuw nsw i64 [[IND]], 1
 ; LV-NEXT: [[DEC]] = sub i32 [[IND1]], 1
 ; LV-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[N]]
-; LV-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT6:%.*]], label [[FOR_BODY]]
+; LV-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT7:%.*]], label [[FOR_BODY]]
 ; LV: for.end.loopexit:
 ; LV-NEXT: br label [[FOR_END:%.*]]
-; LV: for.end.loopexit6:
+; LV: for.end.loopexit7:
 ; LV-NEXT: br label [[FOR_END]]
 ; LV: for.end:
 ; LV-NEXT: ret void
@@ -269,7 +277,7 @@
 define void @f3(i16* noalias %a,
 ; LV-LABEL: @f3(
 ; LV-NEXT: for.body.lver.check:
-; LV-NEXT: [[A2:%.*]] = ptrtoint i16* [[A:%.*]] to i64
+; LV-NEXT: [[A5:%.*]] = bitcast i16* [[A:%.*]] to i8*
 ; LV-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], -1
 ; LV-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
 ; LV-NEXT: [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 2, i32 [[TMP1]])
@@ -284,17 +292,21 @@ define void @f3(i16* noalias %a,
 ; LV-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
 ; LV-NEXT: [[TMP9:%.*]] = or i1 [[TMP8]], [[MUL_OVERFLOW]]
 ; LV-NEXT: [[TMP10:%.*]] = or i1 false, [[TMP9]]
-; LV-NEXT: [[MUL3:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 4, i64 [[TMP0]])
-; LV-NEXT: [[MUL_RESULT4:%.*]] = extractvalue { i64, i1 } [[MUL3]], 0
-; LV-NEXT: [[MUL_OVERFLOW5:%.*]] = extractvalue { i64, i1 } [[MUL3]], 1
-; LV-NEXT: [[TMP11:%.*]] = add i64 [[A2]], [[MUL_RESULT4]]
-; LV-NEXT: [[TMP12:%.*]] = sub i64 [[A2]], [[MUL_RESULT4]]
-; LV-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[TMP12]], [[A2]]
-; LV-NEXT: [[TMP14:%.*]] = icmp ult i64 [[TMP11]], [[A2]]
-; LV-NEXT: [[TMP15:%.*]] = select i1 false, i1 [[TMP13]], i1 [[TMP14]]
-; LV-NEXT: [[TMP16:%.*]] = or i1 [[TMP15]], [[MUL_OVERFLOW5]]
-; LV-NEXT: [[TMP17:%.*]] = or i1 [[TMP10]], [[TMP16]]
-; LV-NEXT: br i1 [[TMP17]], label [[FOR_BODY_PH_LVER_ORIG:%.*]], label [[FOR_BODY_PH:%.*]]
+; LV-NEXT: [[MUL2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 4, i64 [[TMP0]])
+; LV-NEXT: [[MUL_RESULT3:%.*]] = extractvalue { i64, i1 } [[MUL2]], 0
+; LV-NEXT: [[MUL_OVERFLOW4:%.*]] = extractvalue { i64, i1 } [[MUL2]], 1
+; LV-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A5]], i64 [[MUL_RESULT3]]
+; LV-NEXT: [[TMP11:%.*]] = bitcast i8* [[UGLYGEP]] to i16*
+; LV-NEXT: [[TMP12:%.*]] = sub i64 [[MUL_RESULT3]], -4
+; LV-NEXT: [[TMP13:%.*]] = sub i64 4, [[TMP12]]
+; LV-NEXT: [[UGLYGEP6:%.*]] = getelementptr i8, i8* [[A5]], i64 [[TMP13]]
+; LV-NEXT: [[TMP14:%.*]] = bitcast i8* [[UGLYGEP6]] to i16*
+; LV-NEXT: [[TMP15:%.*]] = icmp ugt i16* [[TMP14]], [[A]]
+; LV-NEXT: [[TMP16:%.*]] = icmp ult i16* [[TMP11]], [[A]]
+; LV-NEXT: [[TMP17:%.*]] = select i1 false, i1 [[TMP15]], i1 [[TMP16]]
+; LV-NEXT: [[TMP18:%.*]] = or i1 [[TMP17]], [[MUL_OVERFLOW4]]
+; LV-NEXT: [[TMP19:%.*]] = or i1 [[TMP10]], [[TMP18]]
+; LV-NEXT: br i1 [[TMP19]], label [[FOR_BODY_PH_LVER_ORIG:%.*]], label [[FOR_BODY_PH:%.*]]
 ; LV: for.body.ph.lver.orig:
 ; LV-NEXT: br label [[FOR_BODY_LVER_ORIG:%.*]]
 ; LV: for.body.lver.orig:
@@ -328,10 +340,10 @@ define void @f3(i16* noalias %a,
 ; LV-NEXT: [[INC]] = add nuw nsw i64 [[IND]], 1
 ; LV-NEXT: [[INC1]] = add i32 [[IND1]], 1
 ; LV-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[N]]
-; LV-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT6:%.*]], label [[FOR_BODY]]
+; LV-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT7:%.*]], label [[FOR_BODY]]
 ; LV: for.end.loopexit:
 ; LV-NEXT: br label [[FOR_END:%.*]]
-; LV: for.end.loopexit6:
+; LV: for.end.loopexit7:
 ; LV-NEXT: br label [[FOR_END]]
 ; LV: for.end:
 ; LV-NEXT: ret void
@@ -370,7 +382,6 @@
 define void @f4(i16* noalias %a,
 ; LV-LABEL: @f4(
 ; LV-NEXT: for.body.lver.check:
-; LV-NEXT: [[A2:%.*]] = ptrtoint i16* [[A:%.*]] to i64
 ; LV-NEXT: [[TRUNCN:%.*]] = trunc i64 [[N:%.*]] to i32
 ; LV-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
 ; LV-NEXT: [[TMP1:%.*]] = shl i32 [[TRUNCN]], 1
@@ -388,17 +399,21 @@ define void @f4(i16* noalias %a,
 ; LV-NEXT: [[TMP10:%.*]] = or i1 [[TMP9]], [[MUL_OVERFLOW]]
 ; LV-NEXT: [[TMP11:%.*]] = or i1 false, [[TMP10]]
 ; LV-NEXT: [[TMP12:%.*]] = sext i32 [[TMP1]] to i64
-; LV-NEXT: [[TMP13:%.*]] = shl nsw i64 [[TMP12]], 1
-; LV-NEXT: [[TMP14:%.*]] = add i64 [[A2]], [[TMP13]]
-; LV-NEXT: [[MUL3:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 4, i64 [[TMP0]])
-; LV-NEXT: [[MUL_RESULT4:%.*]] = extractvalue { i64, i1 } [[MUL3]], 0
-; LV-NEXT: [[MUL_OVERFLOW5:%.*]] = extractvalue { i64, i1 } [[MUL3]], 1
-; LV-NEXT: [[TMP15:%.*]] = add i64 [[TMP14]], [[MUL_RESULT4]]
-; LV-NEXT: [[TMP16:%.*]] = sub i64 [[TMP14]], [[MUL_RESULT4]]
-; LV-NEXT: [[TMP17:%.*]] = icmp ugt i64 [[TMP16]], [[TMP14]]
-; LV-NEXT: [[TMP18:%.*]] = icmp ult i64 [[TMP15]], [[TMP14]]
+; LV-NEXT: [[SCEVGEP:%.*]] = getelementptr i16, i16* [[A:%.*]], i64 [[TMP12]]
+; LV-NEXT: [[MUL2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 4, i64 [[TMP0]])
+; LV-NEXT: [[MUL_RESULT3:%.*]] = extractvalue { i64, i1 } [[MUL2]], 0
+; LV-NEXT: [[MUL_OVERFLOW4:%.*]] = extractvalue { i64, i1 } [[MUL2]], 1
+; LV-NEXT: [[SCEVGEP5:%.*]] = bitcast i16* [[SCEVGEP]] to i8*
+; LV-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[SCEVGEP5]], i64 [[MUL_RESULT3]]
+; LV-NEXT: [[TMP13:%.*]] = bitcast i8* [[UGLYGEP]] to i16*
+; LV-NEXT: [[TMP14:%.*]] = sub i64 [[MUL_RESULT3]], -4
+; LV-NEXT: [[TMP15:%.*]] = sub i64 4, [[TMP14]]
+; LV-NEXT: [[UGLYGEP6:%.*]] = getelementptr i8, i8* [[SCEVGEP5]], i64 [[TMP15]]
+; LV-NEXT: [[TMP16:%.*]] = bitcast i8* [[UGLYGEP6]] to i16*
+; LV-NEXT: [[TMP17:%.*]] = icmp ugt i16* [[TMP16]], [[SCEVGEP]]
+; LV-NEXT: [[TMP18:%.*]] = icmp ult i16* [[TMP13]], [[SCEVGEP]]
 ; LV-NEXT: [[TMP19:%.*]] = select i1 true, i1 [[TMP17]], i1 [[TMP18]]
-; LV-NEXT: [[TMP20:%.*]] = or i1 [[TMP19]], [[MUL_OVERFLOW5]]
+; LV-NEXT: [[TMP20:%.*]] = or i1 [[TMP19]], [[MUL_OVERFLOW4]]
 ; LV-NEXT: [[TMP21:%.*]] = or i1 [[TMP11]], [[TMP20]]
 ; LV-NEXT: br i1 [[TMP21]], label [[FOR_BODY_PH_LVER_ORIG:%.*]], label [[FOR_BODY_PH:%.*]]
 ; LV: for.body.ph.lver.orig:
@@ -434,10 +449,10 @@ define void @f4(i16* noalias %a,
 ; LV-NEXT: [[INC]] = add nuw nsw i64 [[IND]], 1
 ; LV-NEXT: [[DEC]] = sub i32 [[IND1]], 1
 ; LV-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[N]]
-; LV-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT6:%.*]], label [[FOR_BODY]]
+; LV-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT7:%.*]], label [[FOR_BODY]]
 ; LV: for.end.loopexit:
 ; LV-NEXT: br label [[FOR_END:%.*]]
-; LV: for.end.loopexit6:
+; LV: for.end.loopexit7:
 ; LV-NEXT: br label [[FOR_END]]
 ; LV: for.end:
 ; LV-NEXT: ret void
@@ -484,7 +499,6 @@
 define void @f5(i16* noalias %a,
 ; LV-LABEL: @f5(
 ; LV-NEXT: for.body.lver.check:
-; LV-NEXT: [[A2:%.*]] = ptrtoint i16* [[A:%.*]] to i64
 ; LV-NEXT: [[TRUNCN:%.*]] = trunc i64 [[N:%.*]] to i32
 ; LV-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
 ; LV-NEXT: [[TMP1:%.*]] = shl i32 [[TRUNCN]], 1
@@ -502,17 +516,21 @@ define void @f5(i16* noalias %a,
 ; LV-NEXT: [[TMP10:%.*]] = or i1 [[TMP9]], [[MUL_OVERFLOW]]
 ; LV-NEXT: [[TMP11:%.*]] = or i1 false, [[TMP10]]
 ; LV-NEXT: [[TMP12:%.*]] = sext i32 [[TMP1]] to i64
-; LV-NEXT: [[TMP13:%.*]] = shl nsw i64 [[TMP12]], 1
-; LV-NEXT: [[TMP14:%.*]] = add i64 [[A2]], [[TMP13]]
-; LV-NEXT: [[MUL3:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 4, i64 [[TMP0]])
-; LV-NEXT: [[MUL_RESULT4:%.*]] = extractvalue { i64, i1 } [[MUL3]], 0
-; LV-NEXT: [[MUL_OVERFLOW5:%.*]] = extractvalue { i64, i1 } [[MUL3]], 1
-; LV-NEXT: [[TMP15:%.*]] = add i64 [[TMP14]], [[MUL_RESULT4]]
-; LV-NEXT: [[TMP16:%.*]] = sub i64 [[TMP14]], [[MUL_RESULT4]]
-; LV-NEXT: [[TMP17:%.*]] = icmp ugt i64 [[TMP16]], [[TMP14]]
-; LV-NEXT: [[TMP18:%.*]] = icmp ult i64 [[TMP15]], [[TMP14]]
+; LV-NEXT: [[SCEVGEP:%.*]] = getelementptr i16, i16* [[A:%.*]], i64 [[TMP12]]
+; LV-NEXT: [[MUL2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 4, i64 [[TMP0]])
+; LV-NEXT: [[MUL_RESULT3:%.*]] = extractvalue { i64, i1 } [[MUL2]], 0
+; LV-NEXT: [[MUL_OVERFLOW4:%.*]] = extractvalue { i64, i1 } [[MUL2]], 1
+; LV-NEXT: [[SCEVGEP5:%.*]] = bitcast i16* [[SCEVGEP]] to i8*
+; LV-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[SCEVGEP5]], i64 [[MUL_RESULT3]]
+; LV-NEXT: [[TMP13:%.*]] = bitcast i8* [[UGLYGEP]] to i16*
+; LV-NEXT: [[TMP14:%.*]] = sub i64 [[MUL_RESULT3]], -4
+; LV-NEXT: [[TMP15:%.*]] = sub i64 4, [[TMP14]]
+; LV-NEXT: [[UGLYGEP6:%.*]] = getelementptr i8, i8* [[SCEVGEP5]], i64 [[TMP15]]
+; LV-NEXT: [[TMP16:%.*]] = bitcast i8* [[UGLYGEP6]] to i16*
+; LV-NEXT: [[TMP17:%.*]] = icmp ugt i16* [[TMP16]], [[SCEVGEP]]
+; LV-NEXT: [[TMP18:%.*]] = icmp ult i16* [[TMP13]], [[SCEVGEP]]
 ; LV-NEXT: [[TMP19:%.*]] = select i1 true, i1 [[TMP17]], i1 [[TMP18]]
-; LV-NEXT: [[TMP20:%.*]] = or i1 [[TMP19]], [[MUL_OVERFLOW5]]
+; LV-NEXT: [[TMP20:%.*]] = or i1 [[TMP19]], [[MUL_OVERFLOW4]]
 ; LV-NEXT: [[TMP21:%.*]] = or i1 [[TMP11]], [[TMP20]]
 ; LV-NEXT: br i1 [[TMP21]], label [[FOR_BODY_PH_LVER_ORIG:%.*]], label [[FOR_BODY_PH:%.*]]
 ; LV: for.body.ph.lver.orig:
@@ -546,10 +564,10 @@ define void @f5(i16* noalias %a,
 ; LV-NEXT: [[INC]] = add nuw nsw i64 [[IND]], 1
 ; LV-NEXT: [[DEC]] = sub i32 [[IND1]], 1
 ; LV-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[N]]
-; LV-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT6:%.*]], label [[FOR_BODY]]
+; LV-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT7:%.*]], label [[FOR_BODY]]
 ; LV: for.end.loopexit:
 ; LV-NEXT: br label [[FOR_END:%.*]]
-; LV: for.end.loopexit6:
+; LV: for.end.loopexit7:
 ; LV-NEXT: br label [[FOR_END]]
 ; LV: for.end:
 ; LV-NEXT: ret void
-- 
2.7.4