From 57743883f11662433c7fe13bf4d59f0fca0e2d33 Mon Sep 17 00:00:00 2001
From: Sam McCall
Date: Fri, 20 Jul 2018 12:03:00 +0000
Subject: [PATCH] Revert "[LSV] Refactoring + supporting bitcasts to a type of different size"

This reverts commit r337489.
It causes asserts to fire in some TensorFlow tests, e.g.
tensorflow/compiler/tests/gather_test.py on GPU.

Example stack trace:

Start test case: GatherTest.testHigherRank
assertion failed at third_party/llvm/llvm/lib/Support/APInt.cpp:819 in
llvm::APInt llvm::APInt::trunc(unsigned int) const: width && "Can't truncate to 0 bits"
    @     0x5559446ebe10  __assert_fail
    @     0x55593ef32f5e  llvm::APInt::trunc()
    @     0x55593d78f86e  (anonymous namespace)::Vectorizer::lookThroughComplexAddresses()
    @     0x55593d78f2bc  (anonymous namespace)::Vectorizer::areConsecutivePointers()
    @     0x55593d78d128  (anonymous namespace)::Vectorizer::isConsecutiveAccess()
    @     0x55593d78c926  (anonymous namespace)::Vectorizer::vectorizeInstructions()
    @     0x55593d78c221  (anonymous namespace)::Vectorizer::vectorizeChains()
    @     0x55593d78b948  (anonymous namespace)::Vectorizer::run()
    @     0x55593d78b725  (anonymous namespace)::LoadStoreVectorizer::runOnFunction()
    @     0x55593edf4b17  llvm::FPPassManager::runOnFunction()
    @     0x55593edf4e55  llvm::FPPassManager::runOnModule()
    @     0x55593edf563c  (anonymous namespace)::MPPassManager::runOnModule()
    @     0x55593edf5137  llvm::legacy::PassManagerImpl::run()
    @     0x55593edf5b71  llvm::legacy::PassManager::run()
    @     0x55593ced250d  xla::gpu::IrDumpingPassManager::run()
    @     0x55593ced5033  xla::gpu::(anonymous namespace)::EmitModuleToPTX()
    @     0x55593ced40ba  xla::gpu::(anonymous namespace)::CompileModuleToPtx()
    @     0x55593ced33d0  xla::gpu::CompileToPtx()
    @     0x55593b26b2a2  xla::gpu::NVPTXCompiler::RunBackend()
    @     0x55593b21f973  xla::Service::BuildExecutable()
    @     0x555938f44e64  xla::LocalService::CompileExecutable()
    @     0x555938f30a85  xla::LocalClient::Compile()
    @     0x555938de3c29  tensorflow::XlaCompilationCache::BuildExecutable()
    @     0x555938de4e9e  tensorflow::XlaCompilationCache::CompileImpl()
    @     0x555938de3da5  tensorflow::XlaCompilationCache::Compile()
    @     0x555938c5d962  tensorflow::XlaLocalLaunchBase::Compute()
    @     0x555938c68151  tensorflow::XlaDevice::Compute()
    @     0x55593f389e1f  tensorflow::(anonymous namespace)::ExecutorState::Process()
    @     0x55593f38a625  tensorflow::(anonymous namespace)::ExecutorState::ScheduleReady()::$_1::operator()()
*** SIGABRT received by PID 7798 (TID 7837) from PID 7798; ***

llvm-svn: 337541
---
 .../Transforms/Vectorize/LoadStoreVectorizer.cpp  | 108 +++++++++------------
 llvm/test/CodeGen/X86/loadStore_vectorizer.ll     |  18 +---
 .../LoadStoreVectorizer/AMDGPU/gep-bitcast.ll     |  21 +---
 3 files changed, 50 insertions(+), 97 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 14b73e9..8ce4082 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -118,6 +118,8 @@ public:
   bool run();
 
 private:
+  GetElementPtrInst *getSourceGEP(Value *Src) const;
+
   unsigned getPointerAddressSpace(Value *I);
 
   unsigned getAlignment(LoadInst *LI) const {
@@ -137,8 +139,6 @@ private:
   }
 
   bool isConsecutiveAccess(Value *A, Value *B);
-  bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt Size);
-  bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta);
 
   /// After vectorization, reorder the instructions that I depends on
   /// (the instructions defining its operands), to ensure they dominate I.
@@ -277,6 +277,21 @@ unsigned Vectorizer::getPointerAddressSpace(Value *I) {
   return -1;
 }
 
+GetElementPtrInst *Vectorizer::getSourceGEP(Value *Src) const {
+  // First strip pointer bitcasts. Make sure pointee size is the same with
+  // and without casts.
+  // TODO: a stride set by the add instruction below can match the difference
+  // in pointee type size here. Currently it will not be vectorized.
+  Value *SrcPtr = getLoadStorePointerOperand(Src);
+  Value *SrcBase = SrcPtr->stripPointerCasts();
+  Type *SrcPtrType = SrcPtr->getType()->getPointerElementType();
+  Type *SrcBaseType = SrcBase->getType()->getPointerElementType();
+  if (SrcPtrType->isSized() && SrcBaseType->isSized() &&
+      DL.getTypeStoreSize(SrcPtrType) == DL.getTypeStoreSize(SrcBaseType))
+    SrcPtr = SrcBase;
+  return dyn_cast<GetElementPtrInst>(SrcPtr);
+}
+
 // FIXME: Merge with llvm::isConsecutiveAccess
 bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
   Value *PtrA = getLoadStorePointerOperand(A);
@@ -289,6 +304,7 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
     return false;
 
   // Make sure that A and B are different pointers of the same size type.
+  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
   Type *PtrATy = PtrA->getType()->getPointerElementType();
   Type *PtrBTy = PtrB->getType()->getPointerElementType();
   if (PtrA == PtrB ||
@@ -298,16 +314,10 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
       DL.getTypeStoreSize(PtrBTy->getScalarType()))
     return false;
 
-  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
   APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
 
-  return areConsecutivePointers(PtrA, PtrB, Size);
-}
-
-bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB, APInt Size) {
-  unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
-  APInt OffsetA(PtrBitWidth, 0);
-  APInt OffsetB(PtrBitWidth, 0);
+  unsigned IdxWidth = DL.getIndexSizeInBits(ASA);
+  APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
   PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
   PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
 
@@ -341,94 +351,68 @@ bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB, APInt Size) {
   // Sometimes even this doesn't work, because SCEV can't always see through
   // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
   // things the hard way.
-  return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta);
-}
-
-bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
-                                             APInt PtrDelta) {
-  auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
-  auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
-  if (!GEPA || !GEPB)
-    return false;
 
   // Look through GEPs after checking they're the same except for the last
   // index.
-  if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
-      GEPA->getPointerOperand() != GEPB->getPointerOperand())
+  GetElementPtrInst *GEPA = getSourceGEP(A);
+  GetElementPtrInst *GEPB = getSourceGEP(B);
+  if (!GEPA || !GEPB || GEPA->getNumOperands() != GEPB->getNumOperands())
     return false;
-  gep_type_iterator GTIA = gep_type_begin(GEPA);
-  gep_type_iterator GTIB = gep_type_begin(GEPB);
-  for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
-    if (GTIA.getOperand() != GTIB.getOperand())
+  unsigned FinalIndex = GEPA->getNumOperands() - 1;
+  for (unsigned i = 0; i < FinalIndex; i++)
+    if (GEPA->getOperand(i) != GEPB->getOperand(i))
       return false;
-    ++GTIA;
-    ++GTIB;
-  }
 
-  Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
-  Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
+  Instruction *OpA = dyn_cast<Instruction>(GEPA->getOperand(FinalIndex));
+  Instruction *OpB = dyn_cast<Instruction>(GEPB->getOperand(FinalIndex));
   if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
       OpA->getType() != OpB->getType())
     return false;
 
-  if (PtrDelta.isNegative()) {
-    if (PtrDelta.isMinSignedValue())
-      return false;
-    PtrDelta.negate();
-    std::swap(OpA, OpB);
-  }
-  uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
-  if (PtrDelta.urem(Stride) != 0)
-    return false;
-  unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
-  APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);
-
   // Only look through a ZExt/SExt.
   if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
     return false;
 
   bool Signed = isa<SExtInst>(OpA);
 
-  // At this point A could be a function parameter, i.e. not an instruction
-  Value *ValA = OpA->getOperand(0);
+  OpA = dyn_cast<Instruction>(OpA->getOperand(0));
   OpB = dyn_cast<Instruction>(OpB->getOperand(0));
-  if (!OpB || ValA->getType() != OpB->getType())
+  if (!OpA || !OpB || OpA->getType() != OpB->getType())
     return false;
 
-  // Now we need to prove that adding IdxDiff to ValA won't overflow.
+  // Now we need to prove that adding 1 to OpA won't overflow.
   bool Safe = false;
-  // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
-  // ValA, we're okay.
+  // First attempt: if OpB is an add with NSW/NUW, and OpB is 1 added to OpA,
+  // we're okay.
   if (OpB->getOpcode() == Instruction::Add &&
       isa<ConstantInt>(OpB->getOperand(1)) &&
-      IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) {
+      cast<ConstantInt>(OpB->getOperand(1))->getSExtValue() > 0) {
     if (Signed)
       Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
     else
       Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
   }
 
-  unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
+  unsigned BitWidth = OpA->getType()->getScalarSizeInBits();
 
   // Second attempt:
-  // If all set bits of IdxDiff or any higher order bit other than the sign bit
-  // are known to be zero in ValA, we can add Diff to it while guaranteeing no
-  // overflow of any sort.
+  // If any bits are known to be zero other than the sign bit in OpA, we can
+  // add 1 to it while guaranteeing no overflow of any sort.
   if (!Safe) {
-    OpA = dyn_cast<Instruction>(ValA);
-    if (!OpA)
-      return false;
     KnownBits Known(BitWidth);
     computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
-    if (Known.Zero.trunc(BitWidth - 1).zext(IdxBitWidth).ult(IdxDiff))
-      return false;
+    if (Known.countMaxTrailingOnes() < (BitWidth - 1))
+      Safe = true;
   }
 
-  const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
+  if (!Safe)
+    return false;
+
+  const SCEV *OffsetSCEVA = SE.getSCEV(OpA);
   const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
-  const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
-  const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
-  return X == OffsetSCEVB;
+  const SCEV *One = SE.getConstant(APInt(BitWidth, 1));
+  const SCEV *X2 = SE.getAddExpr(OffsetSCEVA, One);
+  return X2 == OffsetSCEVB;
 }
 
 void Vectorizer::reorder(Instruction *I) {
diff --git a/llvm/test/CodeGen/X86/loadStore_vectorizer.ll b/llvm/test/CodeGen/X86/loadStore_vectorizer.ll
index 48f3156..03f6ccc 100644
--- a/llvm/test/CodeGen/X86/loadStore_vectorizer.ll
+++ b/llvm/test/CodeGen/X86/loadStore_vectorizer.ll
@@ -1,9 +1,8 @@
-; RUN: opt -mtriple x86_64-- -load-store-vectorizer < %s -S | FileCheck %s
+; RUN: opt -load-store-vectorizer < %s -S | FileCheck %s
 
 %struct_render_pipeline_state = type opaque
 
-define fastcc void @test1(%struct_render_pipeline_state addrspace(1)* %pso) unnamed_addr {
-; CHECK-LABEL: @test1
+define fastcc void @main(%struct_render_pipeline_state addrspace(1)* %pso) unnamed_addr {
 ; CHECK: load i16
 ; CHECK: load i16
 entry:
@@ -15,16 +14,3 @@ entry:
   %tmp4 = load i16, i16 addrspace(1)* %tmp3, align 2
   ret void
 }
-
-define fastcc void @test2(%struct_render_pipeline_state addrspace(1)* %pso) unnamed_addr {
-; CHECK-LABEL: @test2
-; CHECK: load <2 x i16>
-entry:
-  %tmp = bitcast %struct_render_pipeline_state addrspace(1)* %pso to i16 addrspace(1)*
-  %tmp1 = load i16, i16 addrspace(1)* %tmp, align 2
-  %tmp2 = bitcast %struct_render_pipeline_state addrspace(1)* %pso to i8 addrspace(1)*
-  %sunkaddr51 = getelementptr i8, i8 addrspace(1)* %tmp2, i64 2
-  %tmp3 = bitcast i8 addrspace(1)* %sunkaddr51 to i16 addrspace(1)*
-  %tmp4 = load i16, i16 addrspace(1)* %tmp3, align 2
-  ret void
-}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll
index a9c3fbf..b67dc05 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll
@@ -56,8 +56,8 @@ define void @vect_zext_bitcast_i8_st1_to_i32_idx(i8 addrspace(1)* %arg1, i32 %ba
   ret void
 }
 
+; TODO: This can be vectorized, but currently vectorizer unable to do it.
 ; CHECK-LABEL: @vect_zext_bitcast_i8_st4_to_i32_idx
-; CHECK: load <4 x i32>
 define void @vect_zext_bitcast_i8_st4_to_i32_idx(i8 addrspace(1)* %arg1, i32 %base) {
   %add1 = add nuw i32 %base, 0
   %zext1 = zext i32 %add1 to i64
@@ -74,27 +74,10 @@ define void @vect_zext_bitcast_i8_st4_to_i32_idx(i8 addrspace(1)* %arg1, i32 %ba
   %gep3 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext3
   %f2i3 = bitcast i8 addrspace(1)* %gep3 to i32 addrspace(1)*
   %load3 = load i32, i32 addrspace(1)* %f2i3, align 4
-  %add4 = add nuw i32 %base, 12
+  %add4 = add nuw i32 %base, 16
   %zext4 = zext i32 %add4 to i64
   %gep4 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext4
   %f2i4 = bitcast i8 addrspace(1)* %gep4 to i32 addrspace(1)*
   %load4 = load i32, i32 addrspace(1)* %f2i4, align 4
   ret void
 }
-
-; CHECK-LABEL: @vect_zext_bitcast_negative_ptr_delta
-; CHECK: load <2 x i32>
-define void @vect_zext_bitcast_negative_ptr_delta(i32 addrspace(1)* %p, i32 %base) {
-  %p.bitcasted = bitcast i32 addrspace(1)* %p to i16 addrspace(1)*
-  %a.offset = add nuw i32 %base, 4
-  %t.offset.zexted = zext i32 %base to i64
-  %a.offset.zexted = zext i32 %a.offset to i64
-  %t.ptr = getelementptr inbounds i16, i16 addrspace(1)* %p.bitcasted, i64 %t.offset.zexted
-  %a.ptr = getelementptr inbounds i16, i16 addrspace(1)* %p.bitcasted, i64 %a.offset.zexted
-  %b.ptr = getelementptr inbounds i16, i16 addrspace(1)* %t.ptr, i64 6
-  %a.ptr.bitcasted = bitcast i16 addrspace(1)* %a.ptr to i32 addrspace(1)*
-  %b.ptr.bitcasted = bitcast i16 addrspace(1)* %b.ptr to i32 addrspace(1)*
-  %a.val = load i32, i32 addrspace(1)* %a.ptr.bitcasted
-  %b.val = load i32, i32 addrspace(1)* %b.ptr.bitcasted
-  ret void
-}
-- 
2.7.4
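
For context on the stack trace in the commit message: the reverted lookThroughComplexAddresses truncates APInt values to widths derived from the GEP index operand's type, and llvm::APInt::trunc asserts with "Can't truncate to 0 bits" as soon as such a width reaches zero. A minimal standalone sketch of that assertion (an illustration under assumptions, not the original reproducer; it requires an assertions-enabled build of LLVM and linking against LLVMSupport):

  #include "llvm/ADT/APInt.h"

  int main() {
    // Any value will do; the width of the truncation is what matters.
    llvm::APInt Offset(64, 42);
    // Truncating to zero bits trips the same assert named in the trace above:
    //   width && "Can't truncate to 0 bits"
    llvm::APInt Bad = Offset.trunc(0); // asserts inside llvm::APInt::trunc()
    (void)Bad;
    return 0;
  }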