namespace llvm {
namespace VNCoercion {
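+// Return true if Ty is a first class aggregate (struct or array) or a
+// scalable vector type. Value coercion does not handle these: aggregates
+// cannot be bitcast to an integer, and the size of a scalable vector is not
+// known at compile time.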
+static bool isFirstClassAggregateOrScalableType(Type *Ty) {
+  return Ty->isStructTy() || Ty->isArrayTy() ||
+         (Ty->isVectorTy() && Ty->getVectorIsScalable());
+}
+
/// Return true if coerceAvailableValueToLoadType will succeed.
bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
const DataLayout &DL) {
Type *StoredTy = StoredVal->getType();
if (StoredTy == LoadTy)
return true;
- // If the loaded or stored value is an first class array or struct, don't try
- // to transform them. We need to be able to bitcast to integer.
- if (LoadTy->isStructTy() || LoadTy->isArrayTy() || StoredTy->isStructTy() ||
- StoredTy->isArrayTy())
+ // If the loaded or stored value is a first class array/struct or a scalable
+ // type, don't try to transform it. We need to be able to bitcast to integer.
+ if (isFirstClassAggregateOrScalableType(LoadTy) ||
+ isFirstClassAggregateOrScalableType(StoredTy))
return false;
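+ // Scalable types were rejected above, so the sizes queried below are fixed.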
- uint64_t StoreSize = DL.getTypeSizeInBits(StoredTy);
+ uint64_t StoreSize = DL.getTypeSizeInBits(StoredTy).getFixedSize();
// The store size must be byte-aligned to support future type casts.
if (llvm::alignTo(StoreSize, 8) != StoreSize)
return false;
// The store has to be at least as big as the load.
- if (StoreSize < DL.getTypeSizeInBits(LoadTy))
+ if (StoreSize < DL.getTypeSizeInBits(LoadTy).getFixedSize())
return false;
// Don't coerce non-integral pointers to integers or vice versa.
// If this is already the right type, just return it.
Type *StoredValTy = StoredVal->getType();
- uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy);
- uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy);
+ uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy).getFixedSize();
+ uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy).getFixedSize();
// If the store and reload are the same size, we can always reuse it.
if (StoredValSize == LoadedValSize) {
// If this is a big-endian system, we need to shift the value down to the low
// bits so that a truncate will work.
if (DL.isBigEndian()) {
- uint64_t ShiftAmt = DL.getTypeStoreSizeInBits(StoredValTy) -
- DL.getTypeStoreSizeInBits(LoadedTy);
+ uint64_t ShiftAmt = DL.getTypeStoreSizeInBits(StoredValTy).getFixedSize() -
+ DL.getTypeStoreSizeInBits(LoadedTy).getFixedSize();
StoredVal = Helper.CreateLShr(
StoredVal, ConstantInt::get(StoredVal->getType(), ShiftAmt));
}
Value *WritePtr,
uint64_t WriteSizeInBits,
const DataLayout &DL) {
- // If the loaded or stored value is a first class array or struct, don't try
- // to transform them. We need to be able to bitcast to integer.
- if (LoadTy->isStructTy() || LoadTy->isArrayTy())
+ // If the loaded or stored value is a first class array/struct or a scalable
+ // type, don't try to transform it. We need to be able to bitcast to integer.
+ if (isFirstClassAggregateOrScalableType(LoadTy))
return -1;
int64_t StoreOffset = 0, LoadOffset = 0;
// If the load and store don't overlap at all, the store doesn't provide
// anything to the load. In this case, they really don't alias at all, AA
// must have gotten confused.
- uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy);
+ uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy).getFixedSize();
if ((WriteSizeInBits & 7) | (LoadSize & 7))
return -1;
int analyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
StoreInst *DepSI, const DataLayout &DL) {
auto *StoredVal = DepSI->getValueOperand();
-
- // Cannot handle reading from store of first-class aggregate yet.
- if (StoredVal->getType()->isStructTy() ||
- StoredVal->getType()->isArrayTy())
+
+ // Cannot handle reading from store of first-class aggregate or scalable type.
+ if (isFirstClassAggregateOrScalableType(StoredVal->getType()))
return -1;
// Don't coerce non-integral pointers to integers or vice versa.
Value *StorePtr = DepSI->getPointerOperand();
uint64_t StoreSize =
- DL.getTypeSizeInBits(DepSI->getValueOperand()->getType());
+ DL.getTypeSizeInBits(DepSI->getValueOperand()->getType()).getFixedSize();
return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, StorePtr, StoreSize,
DL);
}
return -1;
Value *DepPtr = DepLI->getPointerOperand();
- uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType());
+ uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType()).getFixedSize();
int R = analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, DL);
if (R != -1)
return R;
int64_t LoadOffs = 0;
const Value *LoadBase =
GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, DL);
- unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
+ unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedSize();
unsigned Size =
getLoadLoadClobberFullWidthSize(LoadBase, LoadOffs, LoadSize, DepLI);
return SrcVal;
}
- uint64_t StoreSize = (DL.getTypeSizeInBits(SrcVal->getType()) + 7) / 8;
- uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy) + 7) / 8;
+ uint64_t StoreSize =
+ (DL.getTypeSizeInBits(SrcVal->getType()).getFixedSize() + 7) / 8;
+ uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy).getFixedSize() + 7) / 8;
// Compute which bits of the stored value are being used by the load. Convert
// to an integer type to start with.
if (SrcVal->getType()->isPtrOrPtrVectorTy())
Instruction *InsertPt, const DataLayout &DL) {
// If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to
// widen SrcVal out to a larger load.
- unsigned SrcValStoreSize = DL.getTypeStoreSize(SrcVal->getType());
- unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
+ unsigned SrcValStoreSize =
+ DL.getTypeStoreSize(SrcVal->getType()).getFixedSize();
+ unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedSize();
if (Offset + LoadSize > SrcValStoreSize) {
assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!");
assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load");
Constant *getConstantLoadValueForLoad(Constant *SrcVal, unsigned Offset,
Type *LoadTy, const DataLayout &DL) {
- unsigned SrcValStoreSize = DL.getTypeStoreSize(SrcVal->getType());
- unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
+ unsigned SrcValStoreSize =
+ DL.getTypeStoreSize(SrcVal->getType()).getFixedSize();
+ unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedSize();
if (Offset + LoadSize > SrcValStoreSize)
return nullptr;
return getConstantStoreValueForLoad(SrcVal, Offset, LoadTy, DL);
Type *LoadTy, HelperClass &Helper,
const DataLayout &DL) {
LLVMContext &Ctx = LoadTy->getContext();
- uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy) / 8;
+ uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy).getFixedSize() / 8;
// We know that this method is only called when the mem transfer fully
// provides the bits for the load.
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S < %s -basicaa -gvn -dce | FileCheck %s
+
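+; Check GVN load elimination and store-to-load forwarding on scalable vector
+; types (<vscale x N x Ty>), whose size is not known at compile time.
+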
+; Analyze Load from clobbering Load.
+
+define <vscale x 4 x i32> @load_store_clobber_load(<vscale x 4 x i32> *%p) {
+; CHECK-LABEL: @load_store_clobber_load(
+; CHECK-NEXT: [[LOAD1:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[P:%.*]]
+; CHECK-NEXT: store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* undef
+; CHECK-NEXT: [[ADD:%.*]] = add <vscale x 4 x i32> [[LOAD1]], [[LOAD1]]
+; CHECK-NEXT: ret <vscale x 4 x i32> [[ADD]]
+;
+ %load1 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+ store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* undef
+ %load2 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p ; <- load to be eliminated
+ %add = add <vscale x 4 x i32> %load1, %load2
+ ret <vscale x 4 x i32> %add
+}
+
+define <vscale x 4 x i32> @load_store_clobber_load_mayalias(<vscale x 4 x i32>* %p, <vscale x 4 x i32>* %p2) {
+; CHECK-LABEL: @load_store_clobber_load_mayalias(
+; CHECK-NEXT: [[LOAD1:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[P:%.*]]
+; CHECK-NEXT: store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* [[P2:%.*]]
+; CHECK-NEXT: [[LOAD2:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[P]]
+; CHECK-NEXT: [[SUB:%.*]] = sub <vscale x 4 x i32> [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT: ret <vscale x 4 x i32> [[SUB]]
+;
+ %load1 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+ store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* %p2
+ %load2 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+ %sub = sub <vscale x 4 x i32> %load1, %load2
+ ret <vscale x 4 x i32> %sub
+}
+
+define <vscale x 4 x i32> @load_store_clobber_load_noalias(<vscale x 4 x i32>* noalias %p, <vscale x 4 x i32>* noalias %p2) {
+; CHECK-LABEL: @load_store_clobber_load_noalias(
+; CHECK-NEXT: [[LOAD1:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[P:%.*]]
+; CHECK-NEXT: store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* [[P2:%.*]]
+; CHECK-NEXT: [[ADD:%.*]] = add <vscale x 4 x i32> [[LOAD1]], [[LOAD1]]
+; CHECK-NEXT: ret <vscale x 4 x i32> [[ADD]]
+;
+ %load1 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+ store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* %p2
+ %load2 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p ; <- load to be eliminated
+ %add = add <vscale x 4 x i32> %load1, %load2
+ ret <vscale x 4 x i32> %add
+}
+
+; TODO: BasicAA returns MayAlias for %gep1 and %gep2; this could be improved to MustAlias.
+define i32 @load_clobber_load_gep1(<vscale x 4 x i32>* %p) {
+; CHECK-LABEL: @load_clobber_load_gep1(
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* [[P:%.*]], i64 0, i64 1
+; CHECK-NEXT: [[LOAD1:%.*]] = load i32, i32* [[GEP1]]
+; CHECK-NEXT: [[P2:%.*]] = bitcast <vscale x 4 x i32>* [[P]] to i32*
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i32, i32* [[P2]], i64 1
+; CHECK-NEXT: [[LOAD2:%.*]] = load i32, i32* [[GEP2]]
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT: ret i32 [[ADD]]
+;
+ %gep1 = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %p, i64 0, i64 1
+ %load1 = load i32, i32* %gep1
+ %p2 = bitcast <vscale x 4 x i32>* %p to i32*
+ %gep2 = getelementptr i32, i32* %p2, i64 1
+ %load2 = load i32, i32* %gep2 ; <- load could be eliminated
+ %add = add i32 %load1, %load2
+ ret i32 %add
+}
+
+define i32 @load_clobber_load_gep2(<vscale x 4 x i32>* %p) {
+; CHECK-LABEL: @load_clobber_load_gep2(
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* [[P:%.*]], i64 1, i64 0
+; CHECK-NEXT: [[LOAD1:%.*]] = load i32, i32* [[GEP1]]
+; CHECK-NEXT: [[P2:%.*]] = bitcast <vscale x 4 x i32>* [[P]] to i32*
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i32, i32* [[P2]], i64 4
+; CHECK-NEXT: [[LOAD2:%.*]] = load i32, i32* [[GEP2]]
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT: ret i32 [[ADD]]
+;
+ %gep1 = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %p, i64 1, i64 0
+ %load1 = load i32, i32* %gep1
+ %p2 = bitcast <vscale x 4 x i32>* %p to i32*
+ %gep2 = getelementptr i32, i32* %p2, i64 4
+ %load2 = load i32, i32* %gep2 ; <- cannot determine at compile time whether %load1 and %load2 read the same address
+ %add = add i32 %load1, %load2
+ ret i32 %add
+}
+
+; TODO: BasicAA returns MayAlias for %gep1 and %gep2; this could be improved to MustAlias.
+define i32 @load_clobber_load_gep3(<vscale x 4 x i32>* %p) {
+; CHECK-LABEL: @load_clobber_load_gep3(
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* [[P:%.*]], i64 1, i64 0
+; CHECK-NEXT: [[LOAD1:%.*]] = load i32, i32* [[GEP1]]
+; CHECK-NEXT: [[P2:%.*]] = bitcast <vscale x 4 x i32>* [[P]] to <vscale x 4 x float>*
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* [[P2]], i64 1, i64 0
+; CHECK-NEXT: [[LOAD2:%.*]] = load float, float* [[GEP2]]
+; CHECK-NEXT: [[CAST:%.*]] = bitcast float [[LOAD2]] to i32
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[LOAD1]], [[CAST]]
+; CHECK-NEXT: ret i32 [[ADD]]
+;
+ %gep1 = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %p, i64 1, i64 0
+ %load1 = load i32, i32* %gep1
+ %p2 = bitcast <vscale x 4 x i32>* %p to <vscale x 4 x float>*
+ %gep2 = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %p2, i64 1, i64 0
+ %load2 = load float, float* %gep2 ; <- load could be eliminated
+ %cast = bitcast float %load2 to i32
+ %add = add i32 %load1, %cast
+ ret i32 %add
+}
+
+define <vscale x 4 x i32> @load_clobber_load_fence(<vscale x 4 x i32>* %p) {
+; CHECK-LABEL: @load_clobber_load_fence(
+; CHECK-NEXT: [[LOAD1:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[P:%.*]]
+; CHECK-NEXT: call void asm "", "~{memory}"()
+; CHECK-NEXT: [[LOAD2:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[P]]
+; CHECK-NEXT: [[SUB:%.*]] = sub <vscale x 4 x i32> [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT: ret <vscale x 4 x i32> [[SUB]]
+;
+ %load1 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+ call void asm "", "~{memory}"()
+ %load2 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+ %sub = sub <vscale x 4 x i32> %load1, %load2
+ ret <vscale x 4 x i32> %sub
+}
+
+define <vscale x 4 x i32> @load_clobber_load_sideeffect(<vscale x 4 x i32>* %p) {
+; CHECK-LABEL: @load_clobber_load_sideeffect(
+; CHECK-NEXT: [[LOAD1:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[P:%.*]]
+; CHECK-NEXT: call void asm sideeffect "", ""()
+; CHECK-NEXT: [[LOAD2:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[P]]
+; CHECK-NEXT: [[ADD:%.*]] = add <vscale x 4 x i32> [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT: ret <vscale x 4 x i32> [[ADD]]
+;
+ %load1 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+ call void asm sideeffect "", ""()
+ %load2 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+ %add = add <vscale x 4 x i32> %load1, %load2
+ ret <vscale x 4 x i32> %add
+}
+
+; Analyze Load from clobbering Store.
+
+define <vscale x 4 x i32> @store_forward_to_load(<vscale x 4 x i32>* %p) {
+; CHECK-LABEL: @store_forward_to_load(
+; CHECK-NEXT: store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* [[P:%.*]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+ store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* %p
+ %load = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+ ret <vscale x 4 x i32> %load
+}
+
+define <vscale x 4 x i32> @store_forward_to_load_sideeffect(<vscale x 4 x i32>* %p) {
+; CHECK-LABEL: @store_forward_to_load_sideeffect(
+; CHECK-NEXT: store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* [[P:%.*]]
+; CHECK-NEXT: call void asm sideeffect "", ""()
+; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[P]]
+; CHECK-NEXT: ret <vscale x 4 x i32> [[LOAD]]
+;
+ store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* %p
+ call void asm sideeffect "", ""()
+ %load = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+ ret <vscale x 4 x i32> %load
+}
+
+define i32 @store_clobber_load() {
+; CHECK-LABEL: @store_clobber_load(
+; CHECK-NEXT: [[ALLOC:%.*]] = alloca <vscale x 4 x i32>
+; CHECK-NEXT: store <vscale x 4 x i32> undef, <vscale x 4 x i32>* [[ALLOC]]
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* [[ALLOC]], i32 0, i32 1
+; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* [[PTR]]
+; CHECK-NEXT: ret i32 [[LOAD]]
+;
+ %alloc = alloca <vscale x 4 x i32>
+ store <vscale x 4 x i32> undef, <vscale x 4 x i32>* %alloc
+ %ptr = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %alloc, i32 0, i32 1
+ %load = load i32, i32* %ptr
+ ret i32 %load
+}
+
+; Analyze Load from clobbering MemInst.
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1)
+
+define i32 @memset_clobber_load(<vscale x 4 x i32> *%p) {
+; CHECK-LABEL: @memset_clobber_load(
+; CHECK-NEXT: [[CONV:%.*]] = bitcast <vscale x 4 x i32>* [[P:%.*]] to i8*
+; CHECK-NEXT: tail call void @llvm.memset.p0i8.i64(i8* [[CONV]], i8 1, i64 200, i1 false)
+; CHECK-NEXT: ret i32 16843009
+;
+ %conv = bitcast <vscale x 4 x i32>* %p to i8*
+ tail call void @llvm.memset.p0i8.i64(i8* %conv, i8 1, i64 200, i1 false)
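+ ; The vector index is 0, so element 5 sits at fixed byte offset 20, inside
+ ; the 200-byte memset; the load folds to 0x01010101 (16843009).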
+ %gep = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %p, i64 0, i64 5
+ %load = load i32, i32* %gep
+ ret i32 %load
+}
+
+define i32 @memset_clobber_load_vscaled_base(<vscale x 4 x i32> *%p) {
+; CHECK-LABEL: @memset_clobber_load_vscaled_base(
+; CHECK-NEXT: [[CONV:%.*]] = bitcast <vscale x 4 x i32>* [[P:%.*]] to i8*
+; CHECK-NEXT: tail call void @llvm.memset.p0i8.i64(i8* [[CONV]], i8 1, i64 200, i1 false)
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* [[P]], i64 1, i64 1
+; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* [[GEP]]
+; CHECK-NEXT: ret i32 [[LOAD]]
+;
+ %conv = bitcast <vscale x 4 x i32>* %p to i8*
+ tail call void @llvm.memset.p0i8.i64(i8* %conv, i8 1, i64 200, i1 false)
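+ ; The leading index of 1 scales with vscale, so the byte offset into the
+ ; memset region is not a compile-time constant and the load is not folded.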
+ %gep = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %p, i64 1, i64 1
+ %load = load i32, i32* %gep
+ ret i32 %load
+}
+
+define i32 @memset_clobber_load_nonconst_index(<vscale x 4 x i32> *%p, i64 %idx1, i64 %idx2) {
+; CHECK-LABEL: @memset_clobber_load_nonconst_index(
+; CHECK-NEXT: [[CONV:%.*]] = bitcast <vscale x 4 x i32>* [[P:%.*]] to i8*
+; CHECK-NEXT: tail call void @llvm.memset.p0i8.i64(i8* [[CONV]], i8 1, i64 200, i1 false)
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* [[P]], i64 [[IDX1:%.*]], i64 [[IDX2:%.*]]
+; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* [[GEP]]
+; CHECK-NEXT: ret i32 [[LOAD]]
+;
+ %conv = bitcast <vscale x 4 x i32>* %p to i8*
+ tail call void @llvm.memset.p0i8.i64(i8* %conv, i8 1, i64 200, i1 false)
+ %gep = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %p, i64 %idx1, i64 %idx2
+ %load = load i32, i32* %gep
+ ret i32 %load
+}
+
+
+; Load elimination across BBs
+
+define <vscale x 4 x i32>* @load_from_alloc_replaced_with_undef() {
+; CHECK-LABEL: @load_from_alloc_replaced_with_undef(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A:%.*]] = alloca <vscale x 4 x i32>
+; CHECK-NEXT: br i1 undef, label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* [[A]]
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: ret <vscale x 4 x i32>* [[A]]
+;
+entry:
+ %a = alloca <vscale x 4 x i32>
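+ ; The alloca is uninitialized, so GVN replaces the load with undef and the
+ ; branch condition folds to undef.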
+ %gep = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %a, i64 0, i64 1
+ %load = load i32, i32* %gep ; <- load to be eliminated
+ %tobool = icmp eq i32 %load, 0 ; <- icmp to be eliminated
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then:
+ store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* %a
+ br label %if.end
+
+if.end:
+ ret <vscale x 4 x i32>* %a
+}
+
+define i32 @redundant_load_elimination_1(<vscale x 4 x i32>* %p) {
+; CHECK-LABEL: @redundant_load_elimination_1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* [[P:%.*]], i64 1, i64 1
+; CHECK-NEXT: [[LOAD1:%.*]] = load i32, i32* [[GEP]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LOAD1]], 0
+; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: ret i32 [[LOAD1]]
+;
+entry:
+ %gep = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %p, i64 1, i64 1
+ %load1 = load i32, i32* %gep
+ %cmp = icmp eq i32 %load1, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ %load2 = load i32, i32* %gep ; <- load to be eliminated
+ %add = add i32 %load1, %load2
+ br label %if.end
+
+if.end:
+ %result = phi i32 [ %add, %if.then ], [ %load1, %entry ]
+ ret i32 %result
+}
+
+; TODO: BasicAA returns MayAlias for %gep1 and %gep2; this could be improved to NoAlias.
+define void @redundant_load_elimination_2(i1 %c, <vscale x 4 x i32>* %p, i32* %q, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: @redundant_load_elimination_2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* [[P:%.*]], i64 1, i64 1
+; CHECK-NEXT: store i32 0, i32* [[GEP1]]
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* [[P]], i64 1, i64 0
+; CHECK-NEXT: store i32 1, i32* [[GEP2]]
+; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[T:%.*]] = load i32, i32* [[GEP1]]
+; CHECK-NEXT: store i32 [[T]], i32* [[Q:%.*]]
+; CHECK-NEXT: ret void
+; CHECK: if.else:
+; CHECK-NEXT: ret void
+;
+entry:
+ %gep1 = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %p, i64 1, i64 1
+ store i32 0, i32* %gep1
+ %gep2 = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %p, i64 1, i64 0
+ store i32 1, i32* %gep2
+ br i1 %c, label %if.else, label %if.then
+
+if.then:
+ %t = load i32, i32* %gep1 ; <- load could be eliminated
+ store i32 %t, i32* %q
+ ret void
+
+if.else:
+ ret void
+}
+
+; TODO: the load in if.then could be eliminated.
+define void @missing_load_elimination(i1 %c, <vscale x 4 x i32>* %p, <vscale x 4 x i32>* %q, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: @missing_load_elimination(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* [[P:%.*]]
+; CHECK-NEXT: [[P1:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* [[P]], i64 1
+; CHECK-NEXT: store <vscale x 4 x i32> [[V:%.*]], <vscale x 4 x i32>* [[P1]]
+; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[T:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[P]]
+; CHECK-NEXT: store <vscale x 4 x i32> [[T]], <vscale x 4 x i32>* [[Q:%.*]]
+; CHECK-NEXT: ret void
+; CHECK: if.else:
+; CHECK-NEXT: ret void
+;
+entry:
+ store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* %p
+ %p1 = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %p, i64 1
+ store <vscale x 4 x i32> %v, <vscale x 4 x i32>* %p1
+ br i1 %c, label %if.else, label %if.then
+
+if.then:
+ %t = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p ; load could be eliminated
+ store <vscale x 4 x i32> %t, <vscale x 4 x i32>* %q
+ ret void
+
+if.else:
+ ret void
+}