From 57743883f11662433c7fe13bf4d59f0fca0e2d33 Mon Sep 17 00:00:00 2001
From: Sam McCall
Date: Fri, 20 Jul 2018 12:03:00 +0000
Subject: [PATCH] Revert "[LSV] Refactoring + supporting bitcasts to a type of different size"

This reverts commit r337489.
It causes asserts to fire in some TensorFlow tests, e.g.
tensorflow/compiler/tests/gather_test.py on GPU.

Example stack trace:

Start test case: GatherTest.testHigherRank
assertion failed at third_party/llvm/llvm/lib/Support/APInt.cpp:819 in
llvm::APInt llvm::APInt::trunc(unsigned int) const: width && "Can't truncate to 0 bits"
    @     0x5559446ebe10  __assert_fail
    @     0x55593ef32f5e  llvm::APInt::trunc()
    @     0x55593d78f86e  (anonymous namespace)::Vectorizer::lookThroughComplexAddresses()
    @     0x55593d78f2bc  (anonymous namespace)::Vectorizer::areConsecutivePointers()
    @     0x55593d78d128  (anonymous namespace)::Vectorizer::isConsecutiveAccess()
    @     0x55593d78c926  (anonymous namespace)::Vectorizer::vectorizeInstructions()
    @     0x55593d78c221  (anonymous namespace)::Vectorizer::vectorizeChains()
    @     0x55593d78b948  (anonymous namespace)::Vectorizer::run()
    @     0x55593d78b725  (anonymous namespace)::LoadStoreVectorizer::runOnFunction()
    @     0x55593edf4b17  llvm::FPPassManager::runOnFunction()
    @     0x55593edf4e55  llvm::FPPassManager::runOnModule()
    @     0x55593edf563c  (anonymous namespace)::MPPassManager::runOnModule()
    @     0x55593edf5137  llvm::legacy::PassManagerImpl::run()
    @     0x55593edf5b71  llvm::legacy::PassManager::run()
    @     0x55593ced250d  xla::gpu::IrDumpingPassManager::run()
    @     0x55593ced5033  xla::gpu::(anonymous namespace)::EmitModuleToPTX()
    @     0x55593ced40ba  xla::gpu::(anonymous namespace)::CompileModuleToPtx()
    @     0x55593ced33d0  xla::gpu::CompileToPtx()
    @     0x55593b26b2a2  xla::gpu::NVPTXCompiler::RunBackend()
    @     0x55593b21f973  xla::Service::BuildExecutable()
    @     0x555938f44e64  xla::LocalService::CompileExecutable()
    @     0x555938f30a85  xla::LocalClient::Compile()
    @     0x555938de3c29  tensorflow::XlaCompilationCache::BuildExecutable()
    @     0x555938de4e9e  tensorflow::XlaCompilationCache::CompileImpl()
    @     0x555938de3da5  tensorflow::XlaCompilationCache::Compile()
    @     0x555938c5d962  tensorflow::XlaLocalLaunchBase::Compute()
    @     0x555938c68151  tensorflow::XlaDevice::Compute()
    @     0x55593f389e1f  tensorflow::(anonymous namespace)::ExecutorState::Process()
    @     0x55593f38a625  tensorflow::(anonymous namespace)::ExecutorState::ScheduleReady()::$_1::operator()()
*** SIGABRT received by PID 7798 (TID 7837) from PID 7798; ***

llvm-svn: 337541
---
 .../Transforms/Vectorize/LoadStoreVectorizer.cpp  | 108 +++++++++------------
 llvm/test/CodeGen/X86/loadStore_vectorizer.ll     |  18 +---
 .../LoadStoreVectorizer/AMDGPU/gep-bitcast.ll     |  21 +---
 3 files changed, 50 insertions(+), 97 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 14b73e9..8ce4082 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -118,6 +118,8 @@ public:
   bool run();
 
 private:
+  GetElementPtrInst *getSourceGEP(Value *Src) const;
+
   unsigned getPointerAddressSpace(Value *I);
 
   unsigned getAlignment(LoadInst *LI) const {
@@ -137,8 +139,6 @@ private:
   }
 
   bool isConsecutiveAccess(Value *A, Value *B);
-  bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt Size);
-  bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta);
 
   /// After vectorization, reorder the instructions that I depends on
   /// (the instructions defining its operands), to ensure they dominate I.
@@ -277,6 +277,21 @@ unsigned Vectorizer::getPointerAddressSpace(Value *I) {
   return -1;
 }
 
+GetElementPtrInst *Vectorizer::getSourceGEP(Value *Src) const {
+  // First strip pointer bitcasts. Make sure pointee size is the same with
+  // and without casts.
+  // TODO: a stride set by the add instruction below can match the difference
+  // in pointee type size here. Currently it will not be vectorized.
+  Value *SrcPtr = getLoadStorePointerOperand(Src);
+  Value *SrcBase = SrcPtr->stripPointerCasts();
+  Type *SrcPtrType = SrcPtr->getType()->getPointerElementType();
+  Type *SrcBaseType = SrcBase->getType()->getPointerElementType();
+  if (SrcPtrType->isSized() && SrcBaseType->isSized() &&
+      DL.getTypeStoreSize(SrcPtrType) == DL.getTypeStoreSize(SrcBaseType))
+    SrcPtr = SrcBase;
+  return dyn_cast<GetElementPtrInst>(SrcPtr);
+}
+
 // FIXME: Merge with llvm::isConsecutiveAccess
 bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
   Value *PtrA = getLoadStorePointerOperand(A);
@@ -289,6 +304,7 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
     return false;
 
   // Make sure that A and B are different pointers of the same size type.
+  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
   Type *PtrATy = PtrA->getType()->getPointerElementType();
   Type *PtrBTy = PtrB->getType()->getPointerElementType();
   if (PtrA == PtrB ||
@@ -298,16 +314,10 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
       DL.getTypeStoreSize(PtrBTy->getScalarType()))
     return false;
 
-  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
   APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
 
-  return areConsecutivePointers(PtrA, PtrB, Size);
-}
-
-bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB, APInt Size) {
-  unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
-  APInt OffsetA(PtrBitWidth, 0);
-  APInt OffsetB(PtrBitWidth, 0);
+  unsigned IdxWidth = DL.getIndexSizeInBits(ASA);
+  APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
   PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
   PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
 
@@ -341,94 +351,68 @@ bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB, APInt Size) {
   // Sometimes even this doesn't work, because SCEV can't always see through
   // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
   // things the hard way.
-  return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta);
-}
-
-bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
-                                             APInt PtrDelta) {
-  auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
-  auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
-  if (!GEPA || !GEPB)
-    return false;
 
   // Look through GEPs after checking they're the same except for the last
   // index.
-  if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
-      GEPA->getPointerOperand() != GEPB->getPointerOperand())
+  GetElementPtrInst *GEPA = getSourceGEP(A);
+  GetElementPtrInst *GEPB = getSourceGEP(B);
+  if (!GEPA || !GEPB || GEPA->getNumOperands() != GEPB->getNumOperands())
     return false;
-  gep_type_iterator GTIA = gep_type_begin(GEPA);
-  gep_type_iterator GTIB = gep_type_begin(GEPB);
-  for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
-    if (GTIA.getOperand() != GTIB.getOperand())
+  unsigned FinalIndex = GEPA->getNumOperands() - 1;
+  for (unsigned i = 0; i < FinalIndex; i++)
+    if (GEPA->getOperand(i) != GEPB->getOperand(i))
       return false;
-    ++GTIA;
-    ++GTIB;
-  }
 
-  Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
-  Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
+  Instruction *OpA = dyn_cast<Instruction>(GEPA->getOperand(FinalIndex));
+  Instruction *OpB = dyn_cast<Instruction>(GEPB->getOperand(FinalIndex));
   if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
       OpA->getType() != OpB->getType())
     return false;
 
-  if (PtrDelta.isNegative()) {
-    if (PtrDelta.isMinSignedValue())
-      return false;
-    PtrDelta.negate();
-    std::swap(OpA, OpB);
-  }
-  uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
-  if (PtrDelta.urem(Stride) != 0)
-    return false;
-  unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
-  APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);
-
   // Only look through a ZExt/SExt.
   if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
     return false;
 
   bool Signed = isa<SExtInst>(OpA);
 
-  // At this point A could be a function parameter, i.e. not an instruction
-  Value *ValA = OpA->getOperand(0);
+  OpA = dyn_cast<Instruction>(OpA->getOperand(0));
   OpB = dyn_cast<Instruction>(OpB->getOperand(0));
-  if (!OpB || ValA->getType() != OpB->getType())
+  if (!OpA || !OpB || OpA->getType() != OpB->getType())
     return false;
 
-  // Now we need to prove that adding IdxDiff to ValA won't overflow.
+  // Now we need to prove that adding 1 to OpA won't overflow.
   bool Safe = false;
-  // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
-  // ValA, we're okay.
+  // First attempt: if OpB is an add with NSW/NUW, and OpB is 1 added to OpA,
+  // we're okay.
   if (OpB->getOpcode() == Instruction::Add &&
       isa<ConstantInt>(OpB->getOperand(1)) &&
-      IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) {
+      cast<ConstantInt>(OpB->getOperand(1))->getSExtValue() > 0) {
     if (Signed)
       Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
     else
       Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
   }
 
-  unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
+  unsigned BitWidth = OpA->getType()->getScalarSizeInBits();
 
   // Second attempt:
-  // If all set bits of IdxDiff or any higher order bit other than the sign bit
-  // are known to be zero in ValA, we can add Diff to it while guaranteeing no
-  // overflow of any sort.
+  // If any bits are known to be zero other than the sign bit in OpA, we can
+  // add 1 to it while guaranteeing no overflow of any sort.
   if (!Safe) {
-    OpA = dyn_cast<Instruction>(ValA);
-    if (!OpA)
-      return false;
     KnownBits Known(BitWidth);
     computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
-    if (Known.Zero.trunc(BitWidth - 1).zext(IdxBitWidth).ult(IdxDiff))
-      return false;
+    if (Known.countMaxTrailingOnes() < (BitWidth - 1))
+      Safe = true;
   }
 
-  const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
+  if (!Safe)
+    return false;
+
+  const SCEV *OffsetSCEVA = SE.getSCEV(OpA);
   const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
-  const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
-  const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
-  return X == OffsetSCEVB;
+  const SCEV *One = SE.getConstant(APInt(BitWidth, 1));
+  const SCEV *X2 = SE.getAddExpr(OffsetSCEVA, One);
+  return X2 == OffsetSCEVB;
 }
 
 void Vectorizer::reorder(Instruction *I) {
diff --git a/llvm/test/CodeGen/X86/loadStore_vectorizer.ll b/llvm/test/CodeGen/X86/loadStore_vectorizer.ll
index 48f3156..03f6ccc 100644
--- a/llvm/test/CodeGen/X86/loadStore_vectorizer.ll
+++ b/llvm/test/CodeGen/X86/loadStore_vectorizer.ll
@@ -1,9 +1,8 @@
-; RUN: opt -mtriple x86_64-- -load-store-vectorizer < %s -S | FileCheck %s
+; RUN: opt -load-store-vectorizer < %s -S | FileCheck %s
 
 %struct_render_pipeline_state = type opaque
 
-define fastcc void @test1(%struct_render_pipeline_state addrspace(1)* %pso) unnamed_addr {
-; CHECK-LABEL: @test1
+define fastcc void @main(%struct_render_pipeline_state addrspace(1)* %pso) unnamed_addr {
 ; CHECK: load i16
 ; CHECK: load i16
 entry:
@@ -15,16 +14,3 @@ entry:
   %tmp4 = load i16, i16 addrspace(1)* %tmp3, align 2
   ret void
 }
-
-define fastcc void @test2(%struct_render_pipeline_state addrspace(1)* %pso) unnamed_addr {
-; CHECK-LABEL: @test2
-; CHECK: load <2 x i16>
-entry:
-  %tmp = bitcast %struct_render_pipeline_state addrspace(1)* %pso to i16 addrspace(1)*
-  %tmp1 = load i16, i16 addrspace(1)* %tmp, align 2
-  %tmp2 = bitcast %struct_render_pipeline_state addrspace(1)* %pso to i8 addrspace(1)*
-  %sunkaddr51 = getelementptr i8, i8 addrspace(1)* %tmp2, i64 2
-  %tmp3 = bitcast i8 addrspace(1)* %sunkaddr51 to i16 addrspace(1)*
-  %tmp4 = load i16, i16 addrspace(1)* %tmp3, align 2
-  ret void
-}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll
index a9c3fbf..b67dc05 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll
@@ -56,8 +56,8 @@ define void @vect_zext_bitcast_i8_st1_to_i32_idx(i8 addrspace(1)* %arg1, i32 %ba
   ret void
 }
 
+; TODO: This can be vectorized, but currently vectorizer unable to do it.
 ; CHECK-LABEL: @vect_zext_bitcast_i8_st4_to_i32_idx
-; CHECK: load <4 x i32>
 define void @vect_zext_bitcast_i8_st4_to_i32_idx(i8 addrspace(1)* %arg1, i32 %base) {
   %add1 = add nuw i32 %base, 0
   %zext1 = zext i32 %add1 to i64
@@ -74,27 +74,10 @@ define void @vect_zext_bitcast_i8_st4_to_i32_idx(i8 addrspace(1)* %arg1, i32 %ba
   %gep3 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext3
   %f2i3 = bitcast i8 addrspace(1)* %gep3 to i32 addrspace(1)*
   %load3 = load i32, i32 addrspace(1)* %f2i3, align 4
-  %add4 = add nuw i32 %base, 12
+  %add4 = add nuw i32 %base, 16
   %zext4 = zext i32 %add4 to i64
   %gep4 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext4
   %f2i4 = bitcast i8 addrspace(1)* %gep4 to i32 addrspace(1)*
   %load4 = load i32, i32 addrspace(1)* %f2i4, align 4
   ret void
 }
-
-; CHECK-LABEL: @vect_zext_bitcast_negative_ptr_delta
-; CHECK: load <2 x i32>
-define void @vect_zext_bitcast_negative_ptr_delta(i32 addrspace(1)* %p, i32 %base) {
-  %p.bitcasted = bitcast i32 addrspace(1)* %p to i16 addrspace(1)*
-  %a.offset = add nuw i32 %base, 4
-  %t.offset.zexted = zext i32 %base to i64
-  %a.offset.zexted = zext i32 %a.offset to i64
-  %t.ptr = getelementptr inbounds i16, i16 addrspace(1)* %p.bitcasted, i64 %t.offset.zexted
-  %a.ptr = getelementptr inbounds i16, i16 addrspace(1)* %p.bitcasted, i64 %a.offset.zexted
-  %b.ptr = getelementptr inbounds i16, i16 addrspace(1)* %t.ptr, i64 6
-  %a.ptr.bitcasted = bitcast i16 addrspace(1)* %a.ptr to i32 addrspace(1)*
-  %b.ptr.bitcasted = bitcast i16 addrspace(1)* %b.ptr to i32 addrspace(1)*
-  %a.val = load i32, i32 addrspace(1)* %a.ptr.bitcasted
-  %b.val = load i32, i32 addrspace(1)* %b.ptr.bitcasted
-  ret void
-}
-- 
2.7.4
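
For context on the stack trace in the commit message: the reverted lookThroughComplexAddresses truncates APInt values to widths derived from the GEP index operand's type, and llvm::APInt::trunc asserts with "Can't truncate to 0 bits" as soon as such a width reaches zero. A minimal standalone sketch of that assertion (an illustration under assumptions, not the original reproducer; it requires an assertions-enabled build of LLVM and linking against LLVMSupport):

  #include "llvm/ADT/APInt.h"

  int main() {
    // Any value will do; the width of the truncation is what matters.
    llvm::APInt Offset(64, 42);
    // Truncating to zero bits trips the same assert named in the trace above:
    //   width && "Can't truncate to 0 bits"
    llvm::APInt Bad = Offset.trunc(0); // asserts inside llvm::APInt::trunc()
    (void)Bad;
    return 0;
  }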