};
}
-AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
- const GCNTargetMachine &TM) {
+AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
+ const GCNTargetMachine &TM)
+ : ST(ST_) {
using namespace TargetOpcode;
auto GetAddrSpacePtr = [&TM](unsigned AS) {
[](const LegalityQuery &Query) {
return std::make_pair(0, LLT::scalar(32));
})
- .fewerElementsIf([=, &ST](const LegalityQuery &Query) {
+ .fewerElementsIf([=](const LegalityQuery &Query) {
unsigned MemSize = Query.MMODescrs[0].SizeInBits;
return (MemSize == 96) &&
Query.Types[0].isVector() &&
[=](const LegalityQuery &Query) {
return std::make_pair(0, V2S32);
})
- .legalIf([=, &ST](const LegalityQuery &Query) {
+ .legalIf([=](const LegalityQuery &Query) {
const LLT &Ty0 = Query.Types[0];
unsigned Size = Ty0.getSizeInBits();
return false;
}
+// Lower the amdgcn.implicitarg.ptr intrinsic for GlobalISel.
+//
+// In a non-entry function the implicit-arg pointer is a preloaded function
+// argument (IMPLICIT_ARG_PTR), so defer to the generic preloaded-argument
+// path.  In an entry function it is materialized as the preloaded kernarg
+// segment pointer plus the target's implicit-parameter offset.
+//
+// Returns true (and erases MI) on success; false if the kernarg segment
+// pointer descriptor is unavailable or its value could not be loaded.
+bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
+                                                 MachineRegisterInfo &MRI,
+                                                 MachineIRBuilder &B) const {
+  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
+  if (!MFI->isEntryFunction()) {
+    return legalizePreloadedArgIntrin(MI, MRI, B,
+                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
+  }
+
+  B.setInstr(MI);
+
+  // Byte offset of the implicit parameters from the kernarg segment base.
+  uint64_t Offset =
+    ST.getTargetLowering()->getImplicitParameterOffset(
+      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
+  Register DstReg = MI.getOperand(0).getReg();
+  LLT DstTy = MRI.getType(DstReg);
+  // Index type for the GEP offset, sized to match the destination pointer.
+  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
+
+  // Fetch the preloaded kernarg segment pointer descriptor; bail out if the
+  // argument was not set up for this function.
+  const ArgDescriptor *Arg;
+  const TargetRegisterClass *RC;
+  std::tie(Arg, RC)
+    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
+  if (!Arg)
+    return false;
+
+  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
+  if (!loadInputValue(KernargPtrReg, B, Arg))
+    return false;
+
+  // DstReg = kernarg_segment_ptr + Offset.
+  B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
+  MI.eraseFromParent();
+  return true;
+}
+
bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
return false;
}
+ case Intrinsic::amdgcn_kernarg_segment_ptr:
+ return legalizePreloadedArgIntrin(
+ MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
+ case Intrinsic::amdgcn_implicitarg_ptr:
+ return legalizeImplicitArgPtr(MI, MRI, B);
case Intrinsic::amdgcn_workitem_id_x:
return legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::WORKITEM_ID_X);
--- /dev/null
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V2,HSA,ALL %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V2,OS-MESA3D,MESA,ALL %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-unknown -verify-machineinstrs < %s | FileCheck -check-prefixes=OS-UNKNOWN,MESA,ALL %s
+
+; ALL-LABEL: {{^}}test:
+; CO-V2: enable_sgpr_kernarg_segment_ptr = 1
+; HSA: kernarg_segment_byte_size = 8
+; HSA: kernarg_segment_alignment = 4
+
+; CO-V2: s_load_dword s{{[0-9]+}}, s[4:5], 0xa
+
+; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0xa
+define amdgpu_kernel void @test(i32 addrspace(1)* %out) #1 {
+  ; Load the dword at index 10 from the kernarg segment base and store it.
+  %kernarg.segment.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+  %header.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
+  %gep = getelementptr i32, i32 addrspace(4)* %header.ptr, i64 10
+  %value = load i32, i32 addrspace(4)* %gep
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
+; ALL-LABEL: {{^}}test_implicit:
+; HSA: kernarg_segment_byte_size = 8
+; OS-MESA3D: kernarg_segment_byte_size = 24
+; CO-V2: kernarg_segment_alignment = 4
+
+; In dwords: 10 (gep index) + 9 (36 prepended implicit bytes) + 2 (8-byte out pointer) = 21 = 0x15
+
+; OS-UNKNOWN: s_add_u32 s[[LO:[0-9]+]], s0, 44
+; OS-UNKNOWN-NEXT: s_addc_u32 s[[HI:[0-9]+]], s1, 0
+; OS-UNKNOWN-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[LO]]:[[HI]]{{\]}}, 0xa
+define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 {
+  ; Load the dword at index 10 from the implicit-argument base and store it.
+  %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %header.ptr = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
+  %gep = getelementptr i32, i32 addrspace(4)* %header.ptr, i64 10
+  %value = load i32, i32 addrspace(4)* %gep
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
+; ALL-LABEL: {{^}}test_implicit_alignment:
+; HSA: kernarg_segment_byte_size = 12
+; OS-MESA3D: kernarg_segment_byte_size = 28
+; CO-V2: kernarg_segment_alignment = 4
+
+
+; OS-UNKNOWN: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xc
+; HSA: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x4
+; OS-MESA3D: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x3
+; ALL: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[VAL]]
+; ALL: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[V_VAL]]
+define amdgpu_kernel void @test_implicit_alignment(i32 addrspace(1)* %out, <2 x i8> %in) #1 {
+  ; Load the first dword at the implicit-argument base; the <2 x i8> explicit
+  ; arg forces the implicit area to start at an aligned offset.
+  %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %arg.ptr = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
+  %val = load i32, i32 addrspace(4)* %arg.ptr
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; ALL-LABEL: {{^}}opencl_test_implicit_alignment:
+; HSA: kernarg_segment_byte_size = 64
+; OS-MESA3D: kernarg_segment_byte_size = 28
+; CO-V2: kernarg_segment_alignment = 4
+
+
+; OS-UNKNOWN: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xc
+; HSA: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x4
+; OS-MESA3D: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x3
+; ALL: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[VAL]]
+; ALL: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[V_VAL]]
+define amdgpu_kernel void @opencl_test_implicit_alignment(i32 addrspace(1)* %out, <2 x i8> %in) #2 {
+  ; Same as @test_implicit_alignment but with #2, which requests 48 implicit
+  ; argument bytes via "amdgpu-implicitarg-num-bytes".
+  %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %arg.ptr = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
+  %val = load i32, i32 addrspace(4)* %arg.ptr
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; ALL-LABEL: {{^}}test_no_kernargs:
+; CO-V2: enable_sgpr_kernarg_segment_ptr = 1
+; HSA: kernarg_segment_byte_size = 0
+; OS-MESA3D: kernarg_segment_byte_size = 16
+; CO-V2: kernarg_segment_alignment = 4
+
+; HSA: s_load_dword s{{[0-9]+}}, s[4:5]
+define amdgpu_kernel void @test_no_kernargs() #1 {
+  ; Kernel with no explicit arguments; still reads through the kernarg
+  ; segment pointer (volatile store keeps the load alive).
+  %kernarg.segment.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+  %header.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
+  %gep = getelementptr i32, i32 addrspace(4)* %header.ptr, i64 10
+  %value = load i32, i32 addrspace(4)* %gep
+  store volatile i32 %value, i32 addrspace(1)* undef
+  ret void
+}
+
+; ALL-LABEL: {{^}}opencl_test_implicit_alignment_no_explicit_kernargs:
+; HSA: kernarg_segment_byte_size = 48
+; OS-MESA3D: kernarg_segment_byte_size = 16
+; CO-V2: kernarg_segment_alignment = 4
+define amdgpu_kernel void @opencl_test_implicit_alignment_no_explicit_kernargs() #2 {
+  ; No explicit kernel arguments; #2 requests 48 implicit argument bytes.
+  %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %arg.ptr = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
+  %val = load volatile i32, i32 addrspace(4)* %arg.ptr
+  store volatile i32 %val, i32 addrspace(1)* null
+  ret void
+}
+
+; ALL-LABEL: {{^}}opencl_test_implicit_alignment_no_explicit_kernargs_round_up:
+; HSA: kernarg_segment_byte_size = 40
+; OS-MESA3D: kernarg_segment_byte_size = 16
+; CO-V2: kernarg_segment_alignment = 4
+define amdgpu_kernel void @opencl_test_implicit_alignment_no_explicit_kernargs_round_up() #3 {
+  ; #3 requests 38 implicit argument bytes, exercising segment-size round-up.
+  %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %arg.ptr = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
+  %val = load volatile i32, i32 addrspace(4)* %arg.ptr
+  store volatile i32 %val, i32 addrspace(1)* null
+  ret void
+}
+
+declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
+declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind "amdgpu-implicitarg-num-bytes"="48" }
+attributes #3 = { nounwind "amdgpu-implicitarg-num-bytes"="38" }