/// Helper struct, will support ranges eventually.
struct OffsetInfo {
- int64_t Offset = OffsetAndSize::Unknown;
+ // Default to Unassigned (not Unknown) so "never visited" is distinguishable
+ // from "visited but offset could not be determined"; asserts elsewhere rely
+ // on this distinction.
+ int64_t Offset = OffsetAndSize::Unassigned;
bool operator==(const OffsetInfo &OI) const { return Offset == OI.Offset; }
};
auto HandlePassthroughUser = [&](Value *Usr, OffsetInfo PtrOI,
bool &Follow) {
+ assert(PtrOI.Offset != OffsetAndSize::Unassigned &&
+ "Cannot pass through if the input Ptr was not visited!");
OffsetInfo &UsrOI = OffsetInfoMap[Usr];
UsrOI = PtrOI;
Follow = true;
APInt GEPOffset(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
if (PtrOI.Offset == OffsetAndSize::Unknown ||
!GEP->accumulateConstantOffset(DL, GEPOffset)) {
+ LLVM_DEBUG(dbgs() << "[AAPointerInfo] GEP offset not constant "
+ << *GEP << "\n");
UsrOI.Offset = OffsetAndSize::Unknown;
Follow = true;
return true;
}
+ LLVM_DEBUG(dbgs() << "[AAPointerInfo] GEP offset is constant " << *GEP
+ << "\n");
UsrOI.Offset = PtrOI.Offset + GEPOffset.getZExtValue();
Follow = true;
return true;
bool IsFirstPHIUser = !OffsetInfoMap.count(Usr);
OffsetInfo &UsrOI = OffsetInfoMap[Usr];
OffsetInfo &PtrOI = OffsetInfoMap[CurPtr];
- // Check if the PHI is invariant (so far).
- if (UsrOI == PtrOI)
-   return true;
// Check if the PHI operand has already an unknown offset as we can't
// improve on that anymore.
if (PtrOI.Offset == OffsetAndSize::Unknown) {
+   LLVM_DEBUG(dbgs() << "[AAPointerInfo] PHI operand offset unknown "
+                     << *CurPtr << " in " << *Usr << "\n");
+   // Only revisit users if the PHI's state actually changed to Unknown.
+   Follow = UsrOI.Offset != OffsetAndSize::Unknown;
UsrOI = PtrOI;
-   Follow = true;
+   return true;
+ }
+
+ // Check if the PHI is invariant (so far).
+ if (UsrOI == PtrOI) {
+   assert(PtrOI.Offset != OffsetAndSize::Unassigned &&
+          "Cannot assign if the current Ptr was not visited!");
+   // NOTE(review): this debug message lacks the trailing "\n" the sibling
+   // LLVM_DEBUG lines emit — confirm whether that is intentional.
+   LLVM_DEBUG(dbgs() << "[AAPointerInfo] PHI is invariant (so far)");
  return true;
}
--- /dev/null
+; RUN: llc < %s | FileCheck %s
+
+target triple = "amdgcn-amd-amdhsa"
+
+; The call to intrinsic implicitarg_ptr reaches a load through a phi. The
+; offsets of the phi cannot be determined, and hence the attributor assumes that
+; hostcall is in use.
+
+; CHECK: .value_kind: hidden_hostcall_buffer
+; CHECK: .value_kind: hidden_multigrid_sync_arg
+
+define amdgpu_kernel void @the_kernel(i32 addrspace(1)* %a, i64 %index1, i64 %index2, i1 %cond) {
+entry:
+  %tmp7 = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  br i1 %cond, label %old, label %new
+
+old:                                              ; preds = %entry
+  %tmp4 = getelementptr i8, i8 addrspace(4)* %tmp7, i64 %index1
+  br label %join
+
+new:                                              ; preds = %entry
+  %tmp12 = getelementptr inbounds i8, i8 addrspace(4)* %tmp7, i64 %index2
+  br label %join
+
+join:                                             ; preds = %new, %old
+  %.in.in.in = phi i8 addrspace(4)* [ %tmp12, %new ], [ %tmp4, %old ]
+  %.in.in = bitcast i8 addrspace(4)* %.in.in.in to i16 addrspace(4)*
+
+  ;;; THIS USE is where the offset into implicitarg_ptr is unknown
+  %.in = load i16, i16 addrspace(4)* %.in.in, align 2
+
+  %idx.ext = sext i16 %.in to i64
+  %add.ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idx.ext
+  %tmp16 = atomicrmw add i32 addrspace(1)* %add.ptr3, i32 15 syncscope("agent-one-as") monotonic, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+declare align 4 i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+
+declare i32 @llvm.amdgcn.workgroup.id.x()
ret i32 %add1
}
+; The store through %gep.join (a phi of two variable-index GEPs) has an unknown
+; offset into %Bytes, so the load of %gep0 must not be folded to the constant 7.
+define i8 @local_alloca_not_simplifiable_2(i64 %index1, i64 %index2, i1 %cnd) {
+; TUNIT: Function Attrs: nofree norecurse nosync nounwind willreturn
+; TUNIT-LABEL: define {{[^@]+}}@local_alloca_not_simplifiable_2
+; TUNIT-SAME: (i64 [[INDEX1:%.*]], i64 [[INDEX2:%.*]], i1 [[CND:%.*]]) #[[ATTR3]] {
+; TUNIT-NEXT:  entry:
+; TUNIT-NEXT:    [[BYTES:%.*]] = alloca [1024 x i8], align 16
+; TUNIT-NEXT:    [[GEP0:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[BYTES]], i64 0, i64 0
+; TUNIT-NEXT:    store i8 7, i8* [[GEP0]], align 16
+; TUNIT-NEXT:    br i1 [[CND]], label [[LEFT:%.*]], label [[RIGHT:%.*]]
+; TUNIT:       left:
+; TUNIT-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[BYTES]], i64 0, i64 [[INDEX1]]
+; TUNIT-NEXT:    br label [[JOIN:%.*]]
+; TUNIT:       right:
+; TUNIT-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[BYTES]], i64 0, i64 [[INDEX2]]
+; TUNIT-NEXT:    br label [[JOIN]]
+; TUNIT:       join:
+; TUNIT-NEXT:    [[GEP_JOIN:%.*]] = phi i8* [ [[GEP1]], [[LEFT]] ], [ [[GEP2]], [[RIGHT]] ]
+; TUNIT-NEXT:    store i8 9, i8* [[GEP_JOIN]], align 4
+; TUNIT-NEXT:    [[I:%.*]] = load i8, i8* [[GEP0]], align 16
+; TUNIT-NEXT:    ret i8 [[I]]
+;
+; CGSCC: Function Attrs: nofree norecurse nosync nounwind willreturn
+; CGSCC-LABEL: define {{[^@]+}}@local_alloca_not_simplifiable_2
+; CGSCC-SAME: (i64 [[INDEX1:%.*]], i64 [[INDEX2:%.*]], i1 [[CND:%.*]]) #[[ATTR5]] {
+; CGSCC-NEXT:  entry:
+; CGSCC-NEXT:    [[BYTES:%.*]] = alloca [1024 x i8], align 16
+; CGSCC-NEXT:    [[GEP0:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[BYTES]], i64 0, i64 0
+; CGSCC-NEXT:    store i8 7, i8* [[GEP0]], align 16
+; CGSCC-NEXT:    br i1 [[CND]], label [[LEFT:%.*]], label [[RIGHT:%.*]]
+; CGSCC:       left:
+; CGSCC-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[BYTES]], i64 0, i64 [[INDEX1]]
+; CGSCC-NEXT:    br label [[JOIN:%.*]]
+; CGSCC:       right:
+; CGSCC-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[BYTES]], i64 0, i64 [[INDEX2]]
+; CGSCC-NEXT:    br label [[JOIN]]
+; CGSCC:       join:
+; CGSCC-NEXT:    [[GEP_JOIN:%.*]] = phi i8* [ [[GEP1]], [[LEFT]] ], [ [[GEP2]], [[RIGHT]] ]
+; CGSCC-NEXT:    store i8 9, i8* [[GEP_JOIN]], align 4
+; CGSCC-NEXT:    [[I:%.*]] = load i8, i8* [[GEP0]], align 16
+; CGSCC-NEXT:    ret i8 [[I]]
+;
+entry:
+  %Bytes = alloca [1024 x i8], align 16
+  %gep0 = getelementptr inbounds [1024 x i8], [1024 x i8]* %Bytes, i64 0, i64 0
+  store i8 7, i8* %gep0, align 4
+  br i1 %cnd, label %left, label %right
+
+left:                                             ; preds = %entry
+  %gep1 = getelementptr inbounds [1024 x i8], [1024 x i8]* %Bytes, i64 0, i64 %index1
+  br label %join
+
+right:                                            ; preds = %entry
+  %gep2 = getelementptr inbounds [1024 x i8], [1024 x i8]* %Bytes, i64 0, i64 %index2
+  br label %join
+
+join:                                             ; preds = %right, %left
+  %gep.join = phi i8* [ %gep1, %left ], [ %gep2, %right ]
+  store i8 9, i8* %gep.join, align 4
+
+  ; This load cannot be replaced by the value 7 from %entry, since the previous
+  ; store interferes due to its unknown offset.
+  %i = load i8, i8* %gep0, align 4
+  ret i8 %i
+}
+
; We could simplify these if we separate accessed bins wrt. alignment (here mod 4).
define i32 @unknown_access_mixed_simplifiable(i32 %arg1, i32 %arg2) {
; CHECK: Function Attrs: nofree norecurse nosync nounwind readnone willreturn