[LSV] Refactoring + supporting bitcasts to a type of different size

author Roman Tereshin <rtereshin@apple.com>

Thu, 19 Jul 2018 19:42:43 +0000 (19:42 +0000)

committer Roman Tereshin <rtereshin@apple.com>

Thu, 19 Jul 2018 19:42:43 +0000 (19:42 +0000)
author Roman Tereshin <rtereshin@apple.com>
Thu, 19 Jul 2018 19:42:43 +0000 (19:42 +0000)
committer Roman Tereshin <rtereshin@apple.com>
Thu, 19 Jul 2018 19:42:43 +0000 (19:42 +0000)
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp

index 8ce4082..14b73e9 100644 (file)
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -118,8 +118,6 @@ public:
    bool run();
  
  private:
-  GetElementPtrInst *getSourceGEP(Value *Src) const;
-
    unsigned getPointerAddressSpace(Value *I);
  
    unsigned getAlignment(LoadInst *LI) const {
@@ -139,6 +137,8 @@ private:
    }
  
    bool isConsecutiveAccess(Value *A, Value *B);
+  bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt Size);
+  bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta);
  
    /// After vectorization, reorder the instructions that I depends on
    /// (the instructions defining its operands), to ensure they dominate I.
@@ -277,21 +277,6 @@ unsigned Vectorizer::getPointerAddressSpace(Value *I) {
    return -1;
  }
  
-GetElementPtrInst *Vectorizer::getSourceGEP(Value *Src) const {
-  // First strip pointer bitcasts. Make sure pointee size is the same with
-  // and without casts.
-  // TODO: a stride set by the add instruction below can match the difference
-  // in pointee type size here. Currently it will not be vectorized.
-  Value *SrcPtr = getLoadStorePointerOperand(Src);
-  Value *SrcBase = SrcPtr->stripPointerCasts();
-  Type *SrcPtrType = SrcPtr->getType()->getPointerElementType();
-  Type *SrcBaseType = SrcBase->getType()->getPointerElementType();
-  if (SrcPtrType->isSized() && SrcBaseType->isSized() &&
-      DL.getTypeStoreSize(SrcPtrType) == DL.getTypeStoreSize(SrcBaseType))
-    SrcPtr = SrcBase;
-  return dyn_cast<GetElementPtrInst>(SrcPtr);
-}
-
  // FIXME: Merge with llvm::isConsecutiveAccess
  bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
    Value *PtrA = getLoadStorePointerOperand(A);
@@ -304,7 +289,6 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
      return false;
  
    // Make sure that A and B are different pointers of the same size type.
-  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
    Type *PtrATy = PtrA->getType()->getPointerElementType();
    Type *PtrBTy = PtrB->getType()->getPointerElementType();
    if (PtrA == PtrB ||
@@ -314,10 +298,16 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
            DL.getTypeStoreSize(PtrBTy->getScalarType()))
      return false;
  
+  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
    APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
  
-  unsigned IdxWidth = DL.getIndexSizeInBits(ASA);
-  APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
+  return areConsecutivePointers(PtrA, PtrB, Size);
+}
+
+bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB, APInt Size) {
+  unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
+  APInt OffsetA(PtrBitWidth, 0);
+  APInt OffsetB(PtrBitWidth, 0);
    PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
    PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
  
@@ -351,68 +341,94 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
    // Sometimes even this doesn't work, because SCEV can't always see through
    // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
    // things the hard way.
+  return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta);
+}
+
+bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
+                                             APInt PtrDelta) {
+  auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
+  auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
+  if (!GEPA || !GEPB)
+    return false;
  
    // Look through GEPs after checking they're the same except for the last
    // index.
-  GetElementPtrInst *GEPA = getSourceGEP(A);
-  GetElementPtrInst *GEPB = getSourceGEP(B);
-  if (!GEPA || !GEPB || GEPA->getNumOperands() != GEPB->getNumOperands())
+  if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
+      GEPA->getPointerOperand() != GEPB->getPointerOperand())
      return false;
-  unsigned FinalIndex = GEPA->getNumOperands() - 1;
-  for (unsigned i = 0; i < FinalIndex; i++)
-    if (GEPA->getOperand(i) != GEPB->getOperand(i))
+  gep_type_iterator GTIA = gep_type_begin(GEPA);
+  gep_type_iterator GTIB = gep_type_begin(GEPB);
+  for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
+    if (GTIA.getOperand() != GTIB.getOperand())
        return false;
+    ++GTIA;
+    ++GTIB;
+  }
  
-  Instruction *OpA = dyn_cast<Instruction>(GEPA->getOperand(FinalIndex));
-  Instruction *OpB = dyn_cast<Instruction>(GEPB->getOperand(FinalIndex));
+  Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
+  Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
    if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
        OpA->getType() != OpB->getType())
      return false;
  
+  if (PtrDelta.isNegative()) {
+    if (PtrDelta.isMinSignedValue())
+      return false;
+    PtrDelta.negate();
+    std::swap(OpA, OpB);
+  }
+  uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
+  if (PtrDelta.urem(Stride) != 0)
+    return false;
+  unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
+  APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);
+
    // Only look through a ZExt/SExt.
    if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
      return false;
  
    bool Signed = isa<SExtInst>(OpA);
  
-  OpA = dyn_cast<Instruction>(OpA->getOperand(0));
+  // At this point A could be a function parameter, i.e. not an instruction
+  Value *ValA = OpA->getOperand(0);
    OpB = dyn_cast<Instruction>(OpB->getOperand(0));
-  if (!OpA || !OpB || OpA->getType() != OpB->getType())
+  if (!OpB || ValA->getType() != OpB->getType())
      return false;
  
-  // Now we need to prove that adding 1 to OpA won't overflow.
+  // Now we need to prove that adding IdxDiff to ValA won't overflow.
    bool Safe = false;
-  // First attempt: if OpB is an add with NSW/NUW, and OpB is 1 added to OpA,
-  // we're okay.
+  // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
+  // ValA, we're okay.
    if (OpB->getOpcode() == Instruction::Add &&
        isa<ConstantInt>(OpB->getOperand(1)) &&
-      cast<ConstantInt>(OpB->getOperand(1))->getSExtValue() > 0) {
+      IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) {
      if (Signed)
        Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
      else
        Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
    }
  
-  unsigned BitWidth = OpA->getType()->getScalarSizeInBits();
+  unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
  
    // Second attempt:
-  // If any bits are known to be zero other than the sign bit in OpA, we can
-  // add 1 to it while guaranteeing no overflow of any sort.
+  // If all set bits of IdxDiff or any higher order bit other than the sign bit
+  // are known to be zero in ValA, we can add Diff to it while guaranteeing no
+  // overflow of any sort.
    if (!Safe) {
+    OpA = dyn_cast<Instruction>(ValA);
+    if (!OpA)
+      return false;
      KnownBits Known(BitWidth);
      computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
-    if (Known.countMaxTrailingOnes() < (BitWidth - 1))
-      Safe = true;
+    if (Known.Zero.trunc(BitWidth - 1).zext(IdxBitWidth).ult(IdxDiff))
+      return false;
    }
  
-  if (!Safe)
-    return false;
-
-  const SCEV *OffsetSCEVA = SE.getSCEV(OpA);
+  const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
    const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
-  const SCEV *One = SE.getConstant(APInt(BitWidth, 1));
-  const SCEV *X2 = SE.getAddExpr(OffsetSCEVA, One);
-  return X2 == OffsetSCEVB;
+  const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
+  const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
+  return X == OffsetSCEVB;
  }
  
  void Vectorizer::reorder(Instruction *I) {
diff --git a/llvm/test/CodeGen/X86/loadStore_vectorizer.ll b/llvm/test/CodeGen/X86/loadStore_vectorizer.ll

index 03f6ccc..48f3156 100644 (file)
--- a/llvm/test/CodeGen/X86/loadStore_vectorizer.ll
+++ b/llvm/test/CodeGen/X86/loadStore_vectorizer.ll
@@ -1,8 +1,9 @@
-; RUN: opt -load-store-vectorizer < %s -S | FileCheck %s
+; RUN: opt -mtriple x86_64-- -load-store-vectorizer < %s -S | FileCheck %s
  
  %struct_render_pipeline_state = type opaque
  
-define fastcc void @main(%struct_render_pipeline_state addrspace(1)* %pso) unnamed_addr {
+define fastcc void @test1(%struct_render_pipeline_state addrspace(1)* %pso) unnamed_addr {
+; CHECK-LABEL: @test1
  ; CHECK: load i16
  ; CHECK: load i16
  entry:
@@ -14,3 +15,16 @@ entry:
    %tmp4 = load i16, i16 addrspace(1)* %tmp3, align 2
    ret void
  }
+
+define fastcc void @test2(%struct_render_pipeline_state addrspace(1)* %pso) unnamed_addr {
+; CHECK-LABEL: @test2
+; CHECK: load <2 x i16>
+entry:
+  %tmp = bitcast %struct_render_pipeline_state addrspace(1)* %pso to i16 addrspace(1)*
+  %tmp1 = load i16, i16 addrspace(1)* %tmp, align 2
+  %tmp2 = bitcast %struct_render_pipeline_state addrspace(1)* %pso to i8 addrspace(1)*
+  %sunkaddr51 = getelementptr i8, i8 addrspace(1)* %tmp2, i64 2
+  %tmp3 = bitcast i8 addrspace(1)* %sunkaddr51 to i16 addrspace(1)*
+  %tmp4 = load i16, i16 addrspace(1)* %tmp3, align 2
+  ret void
+}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll

index b67dc05..a9c3fbf 100644 (file)
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll
@@ -56,8 +56,8 @@ define void @vect_zext_bitcast_i8_st1_to_i32_idx(i8 addrspace(1)* %arg1, i32 %ba
    ret void
  }
  
-; TODO: This can be vectorized, but currently vectorizer unable to do it.
  ; CHECK-LABEL: @vect_zext_bitcast_i8_st4_to_i32_idx
+; CHECK: load <4 x i32>
  define void @vect_zext_bitcast_i8_st4_to_i32_idx(i8 addrspace(1)* %arg1, i32 %base) {
    %add1 = add nuw i32 %base, 0
    %zext1 = zext i32 %add1 to i64
@@ -74,10 +74,27 @@ define void @vect_zext_bitcast_i8_st4_to_i32_idx(i8 addrspace(1)* %arg1, i32 %ba
    %gep3 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext3
    %f2i3 = bitcast i8 addrspace(1)* %gep3 to i32 addrspace(1)*
    %load3 = load i32, i32 addrspace(1)* %f2i3, align 4
-  %add4 = add nuw i32 %base, 16
+  %add4 = add nuw i32 %base, 12
    %zext4 = zext i32 %add4 to i64
    %gep4 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext4
    %f2i4 = bitcast i8 addrspace(1)* %gep4 to i32 addrspace(1)*
    %load4 = load i32, i32 addrspace(1)* %f2i4, align 4
    ret void
  }
+
+; CHECK-LABEL: @vect_zext_bitcast_negative_ptr_delta
+; CHECK: load <2 x i32>
+define void @vect_zext_bitcast_negative_ptr_delta(i32 addrspace(1)* %p, i32 %base) {
+  %p.bitcasted = bitcast i32 addrspace(1)* %p to i16 addrspace(1)*
+  %a.offset = add nuw i32 %base, 4
+  %t.offset.zexted = zext i32 %base to i64
+  %a.offset.zexted = zext i32 %a.offset to i64
+  %t.ptr = getelementptr inbounds i16, i16 addrspace(1)* %p.bitcasted, i64 %t.offset.zexted
+  %a.ptr = getelementptr inbounds i16, i16 addrspace(1)* %p.bitcasted, i64 %a.offset.zexted
+  %b.ptr = getelementptr inbounds i16, i16 addrspace(1)* %t.ptr, i64 6
+  %a.ptr.bitcasted = bitcast i16 addrspace(1)* %a.ptr to i32 addrspace(1)*
+  %b.ptr.bitcasted = bitcast i16 addrspace(1)* %b.ptr to i32 addrspace(1)*
+  %a.val = load i32, i32 addrspace(1)* %a.ptr.bitcasted
+  %b.val = load i32, i32 addrspace(1)* %b.ptr.bitcasted
+  ret void
+}
author	Roman Tereshin <rtereshin@apple.com>
	Thu, 19 Jul 2018 19:42:43 +0000 (19:42 +0000)
committer	Roman Tereshin <rtereshin@apple.com>
	Thu, 19 Jul 2018 19:42:43 +0000 (19:42 +0000)
llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp		patch \| blob \| history
llvm/test/CodeGen/X86/loadStore_vectorizer.ll		patch \| blob \| history
llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll		patch \| blob \| history