int *x;
float *y;
};
-// `by-val` struct will be coerced into a similar struct with all generic
-// pointers lowerd into global ones.
+// `by-val` struct is passed by-indirect-alias (a mix of by-ref and indirect
+// by-val). However, the enhanced address inferring pass should be able to
+// assume they are global pointers.
+//
// HOST: define void @_Z22__device_stub__kernel41S(i32* %s.coerce0, float* %s.coerce1)
// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel41S(%struct.S addrspace(4)*{{.*}} byref(%struct.S) align 8 %0)
// OPT: [[R0:%.*]] = getelementptr inbounds %struct.S, %struct.S addrspace(4)* %0, i64 0, i32 0
// OPT: [[P0:%.*]] = load i32*, i32* addrspace(4)* [[R0]], align 8
+// OPT: [[G0:%.*]] = addrspacecast i32* [[P0]] to i32 addrspace(1)*
// OPT: [[R1:%.*]] = getelementptr inbounds %struct.S, %struct.S addrspace(4)* %0, i64 0, i32 1
// OPT: [[P1:%.*]] = load float*, float* addrspace(4)* [[R1]], align 8
-// OPT: [[V0:%.*]] = load i32, i32* [[P0]], align 4
+// OPT: [[G1:%.*]] = addrspacecast float* [[P1]] to float addrspace(1)*
+// OPT: [[V0:%.*]] = load i32, i32 addrspace(1)* [[G0]], align 4
// OPT: [[INC:%.*]] = add nsw i32 [[V0]], 1
-// OPT: store i32 [[INC]], i32* [[P0]], align 4
-// OPT: [[V1:%.*]] = load float, float* [[P1]], align 4
+// OPT: store i32 [[INC]], i32 addrspace(1)* [[G0]], align 4
+// OPT: [[V1:%.*]] = load float, float addrspace(1)* [[G1]], align 4
// OPT: [[ADD:%.*]] = fadd contract float [[V1]], 1.000000e+00
-// OPT: store float [[ADD]], float* [[P1]], align 4
+// OPT: store float [[ADD]], float addrspace(1)* [[G1]], align 4
// OPT: ret void
__global__ void kernel4(struct S s) {
s.x[0]++;
struct T {
float *x[2];
};
-// `by-val` array is also coerced.
+// `by-val` array is passed by-indirect-alias (a mix of by-ref and indirect
+// by-val). However, the enhanced address inferring pass should be able to
+// assume they are global pointers.
+//
// HOST: define void @_Z22__device_stub__kernel61T(float* %t.coerce0, float* %t.coerce1)
// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel61T(%struct.T addrspace(4)*{{.*}} byref(%struct.T) align 8 %0)
// OPT: [[R0:%.*]] = getelementptr inbounds %struct.T, %struct.T addrspace(4)* %0, i64 0, i32 0, i64 0
// OPT: [[P0:%.*]] = load float*, float* addrspace(4)* [[R0]], align 8
+// OPT: [[G0:%.*]] = addrspacecast float* [[P0]] to float addrspace(1)*
// OPT: [[R1:%.*]] = getelementptr inbounds %struct.T, %struct.T addrspace(4)* %0, i64 0, i32 0, i64 1
// OPT: [[P1:%.*]] = load float*, float* addrspace(4)* [[R1]], align 8
-// OPT: [[V0:%.*]] = load float, float* [[P0]], align 4
+// OPT: [[G1:%.*]] = addrspacecast float* [[P1]] to float addrspace(1)*
+// OPT: [[V0:%.*]] = load float, float addrspace(1)* [[G0]], align 4
// OPT: [[ADD0:%.*]] = fadd contract float [[V0]], 1.000000e+00
-// OPT: store float [[ADD0]], float* [[P0]], align 4
-// OPT: [[V1:%.*]] = load float, float* [[P1]], align 4
+// OPT: store float [[ADD0]], float addrspace(1)* [[G0]], align 4
+// OPT: [[V1:%.*]] = load float, float addrspace(1)* [[G1]], align 4
// OPT: [[ADD1:%.*]] = fadd contract float [[V1]], 2.000000e+00
-// OPT: store float [[ADD1]], float* [[P1]], align 4
+// OPT: store float [[ADD1]], float addrspace(1)* [[G1]], align 4
// OPT: ret void
__global__ void kernel6(struct T t) {
t.x[0][0] += 1.f;
Using the constant address space indicates that the data will not change
during the execution of the kernel. This allows scalar read instructions to
- be used. The vector and scalar L1 caches are invalidated of volatile data
- before each kernel dispatch execution to allow constant memory to change
- values between kernel dispatches.
+ be used. As the constant address space could only be modified on the host
+ side, a generic pointer loaded from the constant address space is safe to be
+ assumed as a global pointer since only the device global memory is visible
+ and managed on the host side. The vector and scalar L1 caches are invalidated
+ of volatile data before each kernel dispatch execution to allow constant
+ memory to change values between kernel dispatches.
**Region**
The region address space uses the hardware Global Data Store (GDS). All
bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const;
+ unsigned getAssumedAddrSpace(const Value *V) const;
+
/// Rewrite intrinsic call \p II such that \p OldV will be replaced with \p
/// NewV, which has a different address space. This should happen for every
/// operand index that collectFlatAddressOperands returned for the intrinsic.
virtual bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
Intrinsic::ID IID) const = 0;
virtual bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const = 0;
+ virtual unsigned getAssumedAddrSpace(const Value *V) const = 0;
virtual Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
Value *OldV,
Value *NewV) const = 0;
return Impl.isNoopAddrSpaceCast(FromAS, ToAS);
}
+ unsigned getAssumedAddrSpace(const Value *V) const override {
+ return Impl.getAssumedAddrSpace(V);
+ }
+
Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
Value *NewV) const override {
return Impl.rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
bool isNoopAddrSpaceCast(unsigned, unsigned) const { return false; }
+ unsigned getAssumedAddrSpace(const Value *V) const { return -1; }
+
Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
Value *NewV) const {
return nullptr;
return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
}
+ unsigned getAssumedAddrSpace(const Value *V) const {
+ return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
+ }
+
Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
Value *NewV) const {
return nullptr;
return false;
}
+ /// If the specified generic pointer could be assumed as a pointer to a
+ /// specific address space, return that address space.
+ ///
+ /// Under offloading programming, the offloading target may be passed with
+ /// values only prepared on the host side and could assume certain
+ /// properties.
+ virtual unsigned getAssumedAddrSpace(const Value *V) const { return -1; }
+
/// Get a \c TargetIRAnalysis appropriate for the target.
///
/// This is used to construct the new pass manager's target IR analysis pass,
return TTIImpl->isNoopAddrSpaceCast(FromAS, ToAS);
}
+unsigned TargetTransformInfo::getAssumedAddrSpace(const Value *V) const {
+ return TTIImpl->getAssumedAddrSpace(V);
+}
+
Value *TargetTransformInfo::rewriteIntrinsicWithAddressSpace(
IntrinsicInst *II, Value *OldV, Value *NewV) const {
return TTIImpl->rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
AMDGPU::isFlatGlobalAddrSpace(DestAS);
}
+unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
+ const auto *LD = dyn_cast<LoadInst>(V);
+ if (!LD)
+ return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
+
+ // It must be a generic pointer loaded.
+ assert(V->getType()->isPointerTy() &&
+ V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);
+
+ const auto *Ptr = LD->getPointerOperand();
+ if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
+ return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
+ // For a generic pointer loaded from the constant memory, it could be assumed
+ // as a global pointer since the constant memory is only populated on the
+ // host side. As implied by the offload programming model, only global
+ // pointers could be referenced on the host side.
+ return AMDGPUAS::GLOBAL_ADDRESS;
+}
+
TargetTransformInfo
R600TargetMachine::getTargetTransformInfo(const Function &F) {
return TargetTransformInfo(R600TTIImpl(this, F));
}
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
+
+ unsigned getAssumedAddrSpace(const Value *V) const override;
};
//===----------------------------------------------------------------------===//
case Instruction::IntToPtr:
return isNoopPtrIntCastPair(Op, DL, TTI);
default:
- return false;
+ // That value is an address expression if it has an assumed address space.
+ return TTI->getAssumedAddrSpace(&V) != UninitializedAddressSpace;
}
}
return;
}
- if (isAddressExpression(*V, *DL, TTI) &&
- V->getType()->getPointerAddressSpace() == FlatAddrSpace) {
+ if (V->getType()->getPointerAddressSpace() == FlatAddrSpace &&
+ isAddressExpression(*V, *DL, TTI)) {
if (Visited.insert(V).second) {
PostorderStack.emplace_back(V, false);
}
// Otherwise, adds its operands to the stack and explores them.
PostorderStack.back().setInt(true);
- for (Value *PtrOperand : getPointerOperands(*TopVal, *DL, TTI)) {
- appendsFlatAddressExpressionToPostorderStack(PtrOperand, PostorderStack,
- Visited);
+ // Skip values with an assumed address space.
+ if (TTI->getAssumedAddrSpace(TopVal) == UninitializedAddressSpace) {
+ for (Value *PtrOperand : getPointerOperands(*TopVal, *DL, TTI)) {
+ appendsFlatAddressExpressionToPostorderStack(PtrOperand, PostorderStack,
+ Visited);
+ }
}
}
return Postorder;
return nullptr;
}
+ unsigned AS = TTI->getAssumedAddrSpace(I);
+ if (AS != UninitializedAddressSpace) {
+ // For the assumed address space, insert an `addrspacecast` to make that
+ // explicit.
+ auto *NewPtrTy = I->getType()->getPointerElementType()->getPointerTo(AS);
+ auto *NewI = new AddrSpaceCastInst(I, NewPtrTy);
+ NewI->insertAfter(I);
+ return NewI;
+ }
+
// Computes the converted pointer operands.
SmallVector<Value *, 4> NewPointerOperands;
for (const Use &OperandUse : I->operands()) {
const ValueToValueMapTy &ValueWithNewAddrSpace,
SmallVectorImpl<const Use *> *UndefUsesToFix) const {
// All values in Postorder are flat address expressions.
- assert(isAddressExpression(*V, *DL, TTI) &&
- V->getType()->getPointerAddressSpace() == FlatAddrSpace);
+ assert(V->getType()->getPointerAddressSpace() == FlatAddrSpace &&
+ isAddressExpression(*V, *DL, TTI));
if (Instruction *I = dyn_cast<Instruction>(V)) {
Value *NewV = cloneInstructionWithNewAddressSpace(
else
NewAS = joinAddressSpaces(Src0AS, Src1AS);
} else {
- for (Value *PtrOperand : getPointerOperands(V, *DL, TTI)) {
- auto I = InferredAddrSpace.find(PtrOperand);
- unsigned OperandAS = I != InferredAddrSpace.end() ?
- I->second : PtrOperand->getType()->getPointerAddressSpace();
-
- // join(flat, *) = flat. So we can break if NewAS is already flat.
- NewAS = joinAddressSpaces(NewAS, OperandAS);
- if (NewAS == FlatAddrSpace)
- break;
+ unsigned AS = TTI->getAssumedAddrSpace(&V);
+ if (AS != UninitializedAddressSpace) {
+ // Use the assumed address space directly.
+ NewAS = AS;
+ } else {
+ // Otherwise, infer the address space from its pointer operands.
+ for (Value *PtrOperand : getPointerOperands(V, *DL, TTI)) {
+ auto I = InferredAddrSpace.find(PtrOperand);
+ unsigned OperandAS =
+ I != InferredAddrSpace.end()
+ ? I->second
+ : PtrOperand->getType()->getPointerAddressSpace();
+
+ // join(flat, *) = flat. So we can break if NewAS is already flat.
+ NewAS = joinAddressSpaces(NewAS, OperandAS);
+ if (NewAS == FlatAddrSpace)
+ break;
+ }
}
}
}
User *CurUser = U.getUser();
+ // Skip if the current user is the new value itself.
+ if (CurUser == NewV)
+ continue;
// Handle more complex cases like intrinsic that need to be remangled.
if (auto *MI = dyn_cast<MemIntrinsic>(CurUser)) {
if (!MI->isVolatile() && handleMemIntrinsicPtrUse(MI, V, NewV))
; CHECK-NEXT: s_cselect_b32 s4, 1, 0
; CHECK-NEXT: s_and_b32 s4, s4, 1
; CHECK-NEXT: s_cmp_lg_u32 s4, 0
-; CHECK-NEXT: s_cbranch_scc1 BB4_6
+; CHECK-NEXT: s_cbranch_scc1 BB4_4
; CHECK-NEXT: ; %bb.1: ; %bb2
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, const.ptr@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, const.ptr@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_mov_b32 s4, -1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v0, s6
-; CHECK-NEXT: v_mov_b32_e32 v1, s7
-; CHECK-NEXT: flat_load_dword v0, v[0:1]
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cmp_ngt_f32_e32 vcc, 1.0, v0
-; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; CHECK-NEXT: global_load_dword v0, v0, s[6:7]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, 1.0, v0
+; CHECK-NEXT: s_cbranch_vccnz BB4_3
; CHECK-NEXT: ; %bb.2: ; %bb7
; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: ; %bb.3: ; %bb8
-; CHECK-NEXT: s_or_b64 exec, exec, s[6:7]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[6:7], s4, 0
-; CHECK-NEXT: s_and_saveexec_b64 s[4:5], s[6:7]
-; CHECK-NEXT: s_cbranch_execz BB4_5
-; CHECK-NEXT: ; %bb.4: ; %bb11
+; CHECK-NEXT: BB4_3: ; %bb8
+; CHECK-NEXT: s_cmp_lg_u32 s4, 0
+; CHECK-NEXT: s_cselect_b32 s4, 1, 0
+; CHECK-NEXT: s_and_b32 s4, s4, 1
+; CHECK-NEXT: s_cmp_lg_u32 s4, 0
+; CHECK-NEXT: s_cbranch_scc0 BB4_5
+; CHECK-NEXT: BB4_4: ; %bb12
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+; CHECK-NEXT: BB4_5: ; %bb11
; CHECK-NEXT: v_mov_b32_e32 v0, 4.0
; CHECK-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen
-; CHECK-NEXT: BB4_5: ; %Flow
-; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: BB4_6: ; %bb12
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
bb:
--- /dev/null
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -infer-address-spaces -o - %s | FileCheck %s
+
+@c0 = addrspace(4) global float* undef
+
+; CHECK-LABEL: @generic_ptr_from_constant
+; CHECK: addrspacecast float* %p to float addrspace(1)*
+; CHECK-NEXT: load float, float addrspace(1)*
+define float @generic_ptr_from_constant() {
+ %p = load float*, float* addrspace(4)* @c0
+ %v = load float, float* %p
+ ret float %v
+}
+
+%struct.S = type { i32*, float* }
+
+; CHECK-LABEL: @generic_ptr_from_aggregate_argument
+; CHECK: addrspacecast i32* %p0 to i32 addrspace(1)*
+; CHECK: addrspacecast float* %p1 to float addrspace(1)*
+; CHECK: load i32, i32 addrspace(1)*
+; CHECK: store float %v1, float addrspace(1)*
+; CHECK: ret
+define amdgpu_kernel void @generic_ptr_from_aggregate_argument(%struct.S addrspace(4)* byref(%struct.S) align 8 %0) {
+ %f0 = getelementptr inbounds %struct.S, %struct.S addrspace(4)* %0, i64 0, i32 0
+ %p0 = load i32*, i32* addrspace(4)* %f0
+ %f1 = getelementptr inbounds %struct.S, %struct.S addrspace(4)* %0, i64 0, i32 1
+ %p1 = load float*, float* addrspace(4)* %f1
+ %v0 = load i32, i32* %p0
+ %v1 = sitofp i32 %v0 to float
+ store float %v1, float* %p1
+ ret void
+}