// from the explicit user arguments present in the IR.
void SITargetLowering::passSpecialInputs(
CallLoweringInfo &CLI,
+ CCState &CCInfo,
const SIMachineFunctionInfo &Info,
SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
SmallVectorImpl<SDValue> &MemOpChains,
- SDValue Chain,
- SDValue StackPtr) const {
+ SDValue Chain) const {
// If we don't have a call site, this was a call inserted by
// legalization. These can never use special inputs.
if (!CLI.CS)
if (OutgoingArg->isRegister()) {
RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
} else {
- SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, StackPtr,
- InputReg,
- OutgoingArg->getStackOffset());
+ unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4);
+ SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
+ SpecialArgOffset);
MemOpChains.push_back(ArgStore);
}
}
}
// The first 4 bytes are reserved for the callee's emergency stack slot.
- const unsigned CalleeUsableStackOffset = 4;
-
if (IsTailCall) {
IsTailCall = isEligibleForTailCallOptimization(
Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
+
+ // The first 4 bytes are reserved for the callee's emergency stack slot.
+ CCInfo.AllocateStack(4, 4);
+
CCInfo.AnalyzeCallOperands(Outs, AssignFn);
// Get a count of how many bytes are to be pushed on the stack.
}
}
- // Stack pointer relative accesses are done by changing the offset SGPR. This
- // is just the VGPR offset component.
- SDValue StackPtr = DAG.getConstant(CalleeUsableStackOffset, DL, MVT::i32);
-
SmallVector<SDValue, 8> MemOpChains;
MVT PtrVT = MVT::i32;
unsigned LocMemOffset = VA.getLocMemOffset();
int32_t Offset = LocMemOffset;
- SDValue PtrOff = DAG.getObjectPtrOffset(DL, StackPtr, Offset);
+ SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
if (IsTailCall) {
ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
Offset = Offset + FPDiff;
int FI = MFI.CreateFixedObject(OpSize, Offset, true);
- DstAddr = DAG.getObjectPtrOffset(DL, DAG.getFrameIndex(FI, PtrVT),
- StackPtr);
+ DstAddr = DAG.getFrameIndex(FI, PtrVT);
DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
// Make sure any stack arguments overlapping with where we're storing
}
// Copy special input registers after user input arguments.
- passSpecialInputs(CLI, *Info, RegsToPass, MemOpChains, Chain, StackPtr);
+ passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
; GCN: s_sub_u32 s32, s32, 0xc00{{$}}
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
-define void @call_void_func_byval_struct_func() #0 {
+define void @call_void_func_byval_struct_func() #1 {
entry:
%arg0 = alloca %struct.ByValStruct, align 4, addrspace(5)
%arg1 = alloca %struct.ByValStruct, align 4, addrspace(5)
; GCN: s_swappc_b64
; GCN-NOT: s_sub_u32 s32
; GCN: s_endpgm
-define amdgpu_kernel void @call_void_func_byval_struct_kernel() #0 {
+define amdgpu_kernel void @call_void_func_byval_struct_kernel() #1 {
entry:
%arg0 = alloca %struct.ByValStruct, align 4, addrspace(5)
%arg1 = alloca %struct.ByValStruct, align 4, addrspace(5)
ret void
}
+; GCN-LABEL: {{^}}void_func_byval_struct_align8:
+; GCN: s_mov_b32 s5, s32
+; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8{{$}}
+; GCN-NOT: s32
+; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s5 offset:8{{$}}
+; GCN-NOT: s32
+
+; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:24{{$}}
+; GCN-NOT: s32
+; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s5 offset:24{{$}}
+; GCN-NOT: s32
+define void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg1) #1 {
+entry:
+ %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
+ %tmp = load volatile i32, i32 addrspace(5)* %arrayidx, align 8
+ %add = add nsw i32 %tmp, 1
+ store volatile i32 %add, i32 addrspace(5)* %arrayidx, align 8
+ %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
+ %tmp1 = load volatile i32, i32 addrspace(5)* %arrayidx2, align 8
+ %add3 = add nsw i32 %tmp1, 2
+ store volatile i32 %add3, i32 addrspace(5)* %arrayidx2, align 8
+ store volatile i32 9, i32 addrspace(1)* null, align 4
+ ret void
+}
+
+; Make sure the byval alignment is respected in the call frame setup
+; GCN-LABEL: {{^}}call_void_func_byval_struct_align8_kernel:
+; GCN: s_mov_b32 s33, s7
+; GCN: s_add_u32 s32, s33, 0xc00{{$}}
+
+; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
+; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13
+; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s33 offset:8
+; GCN: buffer_store_dword [[THIRTEEN]], off, s[0:3], s33 offset:24
+
+; GCN-NOT: s_add_u32 s32, s32, 0x800
+
+; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s33 offset:8
+; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s33 offset:12
+; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s33 offset:16
+; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s33 offset:20
+
+; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8{{$}}
+; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:12
+; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:16
+; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:20
+
+; GCN-DAG: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s33 offset:24
+; GCN-DAG: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s33 offset:28
+; GCN-DAG: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s33 offset:32
+; GCN-DAG: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s33 offset:36
+
+; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:24
+; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:28
+; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:32
+; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:36
+
+
+; GCN: s_swappc_b64
+; GCN-NOT: s_sub_u32 s32
+; GCN: s_endpgm
+define amdgpu_kernel void @call_void_func_byval_struct_align8_kernel() #1 {
+entry:
+ %arg0 = alloca %struct.ByValStruct, align 8, addrspace(5)
+ %arg1 = alloca %struct.ByValStruct, align 8, addrspace(5)
+ %tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)*
+ call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp)
+ %tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 addrspace(5)*
+ call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1)
+ %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
+ store volatile i32 9, i32 addrspace(5)* %arrayidx, align 8
+ %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
+ store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 8
+ call void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg0, %struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg1)
+ call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1)
+ call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp)
+ ret void
+}
+
+; GCN-LABEL: {{^}}call_void_func_byval_struct_align8_func:
+; GCN: s_mov_b32 s5, s32
+; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}}
+; GCN-DAG: v_writelane_b32
+
+; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
+; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13
+
+; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:8
+; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:24
+
+; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8
+; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:12
+; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:16
+; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:20
+
+; GCN-NOT: s_add_u32 s32, s32, 0x800
+
+; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8{{$}}
+; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:12
+; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:16
+; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:20
+
+; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:24
+; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:28
+; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:32
+; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:36
+
+; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:24
+; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:28
+; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:32
+; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:36
+
+; GCN: s_swappc_b64
+; GCN-NOT: v_readlane_b32 s32
+; GCN: v_readlane_b32
+; GCN-NOT: v_readlane_b32 s32
+
+; GCN-NOT: s_sub_u32 s32, s32, 0x800
+
+; GCN: s_sub_u32 s32, s32, 0xc00{{$}}
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @call_void_func_byval_struct_align8_func() #0 {
+entry:
+ %arg0 = alloca %struct.ByValStruct, align 8, addrspace(5)
+ %arg1 = alloca %struct.ByValStruct, align 8, addrspace(5)
+ %tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)*
+ call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp)
+ %tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 addrspace(5)*
+ call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1)
+ %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
+ store volatile i32 9, i32 addrspace(5)* %arrayidx, align 8
+ %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
+ store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 8
+ call void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg0, %struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg1)
+ call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1)
+ call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp)
+ ret void
+}
+
; GCN-LABEL: {{^}}call_void_func_byval_struct_kernel_no_frame_pointer_elim:
define amdgpu_kernel void @call_void_func_byval_struct_kernel_no_frame_pointer_elim() #2 {
entry:
; GCN: s_mov_b32 s33, s7
; GCN: s_mov_b32 s32, s33
-; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:8
+; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4
; GCN: s_mov_b32 s4, s33
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 {
; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x:
; GCN: s_mov_b32 s5, s32
-; GCN: buffer_store_dword v1, off, s[0:3], s32 offset:8
+; GCN: buffer_store_dword v1, off, s[0:3], s32 offset:
; GCN: s_swappc_b64
define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
store volatile i32 %arg0, i32 addrspace(1)* undef
; GCN: s_add_u32 s32, s32, 0x400{{$}}
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8{{$}}
+; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4{{$}}
; GCN: s_swappc_b64
; GCN-NOT: s32
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
; GCN: buffer_store_dword [[K]], off, s[0:3], s33 offset:4
-; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:12
+; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:8
; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33 offset:4
; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}}
; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
; GCN: buffer_store_dword [[K]], off, s[0:3], s5 offset:4
-; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:12
+; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:8
; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s5 offset:4
; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}}
ret void
}
-; frame[0] = kernel emergency stack slot
-; frame[1] = callee emergency stack slot
-; frame[2] = ID X
-; frame[3] = ID Y
-; frame[4] = ID Z
+; frame[0] = callee emergency stack slot
+; frame[1] = ID X
+; frame[2] = ID Y
+; frame[3] = ID Z
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_xyz:
; GCN: enable_vgpr_workitem_id = 2
; GCN: s_mov_b32 s33, s7
; GCN: s_mov_b32 s32, s33
-; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 offset:8
-; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:12
-; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:16
+; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 offset:4
+; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:8
+; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:12
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 {
call void @too_many_args_use_workitem_id_xyz(
ret void
}
-; frame[0] = kernel emergency stack slot
-; frame[1] = callee emergency stack slot
-; frame[2] = ID Y
-; frame[3] = ID Z
+; frame[0] = callee emergency stack slot
+; frame[1] = ID Y
+; frame[2] = ID Z
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_stack_yz:
; GCN: enable_vgpr_workitem_id = 2
; GCN: s_mov_b32 s32, s33
; GCN-DAG: v_mov_b32_e32 v31, v0
-; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:8
-; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:12
+; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:4
+; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:8
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_stack_yz() #1 {
call void @too_many_args_use_workitem_id_x_stack_yz(