const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- if (!SPReg)
- SPReg = MIRBuilder.buildCopy(PtrTy, MFI->getStackPtrOffsetReg()).getReg(0);
+ if (!SPReg) {
+ const GCNSubtarget &ST = MIRBuilder.getMF().getSubtarget<GCNSubtarget>();
+ if (ST.enableFlatScratch()) {
+ // The stack is accessed unswizzled, so we can use a regular copy.
+ SPReg = MIRBuilder.buildCopy(PtrTy,
+ MFI->getStackPtrOffsetReg()).getReg(0);
+ } else {
+ // The address we produce here, without knowing the use context, is going
+ // to be interpreted as a vector address, so we need to convert to a
+ // swizzled address.
+ SPReg = MIRBuilder.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {PtrTy},
+ {MFI->getStackPtrOffsetReg()}).getReg(0);
+ }
+ }
auto OffsetReg = MIRBuilder.buildConstant(S32, Offset);
return true;
}
+// Select G_AMDGPU_WAVE_ADDRESS: lower the unswizzled wave-relative scratch
+// offset held in a physical SGPR (e.g. the stack pointer) to a swizzled
+// address by shifting right by log2(wavefront size).
+bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
+  Register DstReg = MI.getOperand(0).getReg();
+  // The source is expected to be a physical SGPR; only the destination's
+  // register bank determines whether we emit the SALU or VALU form.
+  Register SrcReg = MI.getOperand(1).getReg();
+  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
+  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
+  MachineBasicBlock *MBB = MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+
+  if (IsVALU) {
+    // V_LSHRREV takes the shift amount as the first source operand.
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
+      .addImm(Subtarget->getWavefrontSizeLog2())
+      .addReg(SrcReg);
+  } else {
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
+      .addReg(SrcReg)
+      .addImm(Subtarget->getWavefrontSizeLog2());
+  }
+
+  // Constrain the still-generic destination vreg to a concrete register
+  // class matching the bank we selected for.
+  const TargetRegisterClass &RC =
+    IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
+  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
+    return false;
+
+  MI.eraseFromParent();
+  return true;
+}
+
bool AMDGPUInstructionSelector::select(MachineInstr &I) {
if (I.isPHI())
return selectPHI(I);
case AMDGPU::G_SI_CALL:
I.setDesc(TII.get(AMDGPU::SI_CALL));
return true;
+ case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
+ return selectWaveAddress(I);
default:
return selectImpl(I, *CoverageInfo);
}
bool selectGlobalAtomicFadd(MachineInstr &I, MachineOperand &AddrOp,
MachineOperand &DataOp) const;
bool selectBVHIntrinsic(MachineInstr &I) const;
+ bool selectWaveAddress(MachineInstr &I) const;
std::pair<Register, unsigned> selectVOP3ModsImpl(MachineOperand &Root,
bool AllowAbs = true) const;
OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
break;
}
+ case AMDGPU::G_AMDGPU_WAVE_ADDRESS: {
+ // This case is weird because we expect a physical register in the source,
+ // but need to set a bank anyway.
+ //
+ // We could select the result to SGPR or VGPR, but for the one current use
+ // it's more practical to always use VGPR.
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ break;
+ }
case AMDGPU::G_INSERT: {
unsigned BankID = getMappingType(MRI, MI);
unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
}
}
+ if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
+ const MachineOperand &SrcOp = MI.getOperand(1);
+ if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
+ ErrInfo = "pseudo expects only physical SGPRs";
+ return false;
+ }
+ }
+
return true;
}
let Namespace = "AMDGPU";
}
+// Convert a wave address to a swizzled vector address (i.e. this is
+// for copying the stack pointer to a vector address appropriate to
+// use in the offset field of mubuf instructions).
+//
+// Note: unusually for a generic instruction, the source operand is
+// expected to be a physical SGPR (enforced by the machine verifier).
+def G_AMDGPU_WAVE_ADDRESS : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src);
+  let hasSideEffects = 0;
+}
+
// Returns -1 if the input is zero.
def G_AMDGPU_FFBH_U32 : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$dst);
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope -check-prefix=MUBUF %s
+; RUN: llc -global-isel -amdgpu-enable-flat-scratch -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope -check-prefix=FLATSCR %s
+
+; Test end-to-end codegen for outgoing arguments passed on the
+; stack. This test is likely redundant when all DAG and GlobalISel
+; tests are unified.
+
+declare hidden void @external_void_func_v16i32_v16i32_v4i32(<16 x i32>, <16 x i32>, <4 x i32>) #0
+declare hidden void @external_void_func_byval([16 x i32] addrspace(5)* byval([16 x i32])) #0
+
+define amdgpu_kernel void @kernel_caller_stack() {
+; MUBUF-LABEL: kernel_caller_stack:
+; MUBUF: ; %bb.0:
+; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7
+; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
+; MUBUF-NEXT: s_mov_b32 s32, 0
+; MUBUF-NEXT: s_add_u32 s0, s0, s7
+; MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; MUBUF-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; MUBUF-NEXT: v_mov_b32_e32 v1, 9
+; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; MUBUF-NEXT: v_mov_b32_e32 v1, 10
+; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
+; MUBUF-NEXT: v_mov_b32_e32 v1, 11
+; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
+; MUBUF-NEXT: v_mov_b32_e32 v1, 12
+; MUBUF-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen
+; MUBUF-NEXT: s_getpc_b64 s[4:5]
+; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
+; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
+; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
+; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; MUBUF-NEXT: s_endpgm
+;
+; FLATSCR-LABEL: kernel_caller_stack:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3
+; FLATSCR-NEXT: s_mov_b32 s32, 0
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; FLATSCR-NEXT: scratch_store_dword off, v0, s32
+; FLATSCR-NEXT: v_mov_b32_e32 v0, 9
+; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:4
+; FLATSCR-NEXT: v_mov_b32_e32 v0, 10
+; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:8
+; FLATSCR-NEXT: v_mov_b32_e32 v0, 11
+; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:12
+; FLATSCR-NEXT: v_mov_b32_e32 v0, 12
+; FLATSCR-NEXT: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
+; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:16
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR-NEXT: s_endpgm
+ call void @external_void_func_v16i32_v16i32_v4i32(<16 x i32> undef, <16 x i32> undef, <4 x i32> <i32 9, i32 10, i32 11, i32 12>)
+ ret void
+}
+
+define amdgpu_kernel void @kernel_caller_byval() {
+; MUBUF-LABEL: kernel_caller_byval:
+; MUBUF: ; %bb.0:
+; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7
+; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
+; MUBUF-NEXT: s_add_u32 s0, s0, s7
+; MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; MUBUF-NEXT: v_mov_b32_e32 v0, 0
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:20
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:24
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:28
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:32
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:36
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:40
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:44
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:48
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:52
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:56
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:60
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:64
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:68
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:72
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:76
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:80
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:84
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:88
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:92
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:96
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:100
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:104
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:108
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:112
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:116
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:120
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:124
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:132
+; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8
+; MUBUF-NEXT: s_nop 0
+; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:12
+; MUBUF-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:16
+; MUBUF-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:20
+; MUBUF-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:24
+; MUBUF-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:28
+; MUBUF-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:32
+; MUBUF-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:36
+; MUBUF-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:40
+; MUBUF-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:44
+; MUBUF-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:48
+; MUBUF-NEXT: buffer_load_dword v11, off, s[0:3], 0 offset:52
+; MUBUF-NEXT: buffer_load_dword v12, off, s[0:3], 0 offset:56
+; MUBUF-NEXT: buffer_load_dword v13, off, s[0:3], 0 offset:60
+; MUBUF-NEXT: buffer_load_dword v14, off, s[0:3], 0 offset:64
+; MUBUF-NEXT: buffer_load_dword v15, off, s[0:3], 0 offset:68
+; MUBUF-NEXT: s_movk_i32 s32, 0x1400
+; MUBUF-NEXT: v_lshrrev_b32_e64 v16, 6, s32
+; MUBUF-NEXT: s_getpc_b64 s[4:5]
+; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
+; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12
+; MUBUF-NEXT: s_waitcnt vmcnt(15)
+; MUBUF-NEXT: buffer_store_dword v0, v16, s[0:3], 0 offen
+; MUBUF-NEXT: s_waitcnt vmcnt(15)
+; MUBUF-NEXT: buffer_store_dword v1, v16, s[0:3], 0 offen offset:4
+; MUBUF-NEXT: s_waitcnt vmcnt(15)
+; MUBUF-NEXT: buffer_store_dword v2, v16, s[0:3], 0 offen offset:8
+; MUBUF-NEXT: s_waitcnt vmcnt(15)
+; MUBUF-NEXT: buffer_store_dword v3, v16, s[0:3], 0 offen offset:12
+; MUBUF-NEXT: s_waitcnt vmcnt(15)
+; MUBUF-NEXT: buffer_store_dword v4, v16, s[0:3], 0 offen offset:16
+; MUBUF-NEXT: s_waitcnt vmcnt(15)
+; MUBUF-NEXT: buffer_store_dword v5, v16, s[0:3], 0 offen offset:20
+; MUBUF-NEXT: s_waitcnt vmcnt(15)
+; MUBUF-NEXT: buffer_store_dword v6, v16, s[0:3], 0 offen offset:24
+; MUBUF-NEXT: s_waitcnt vmcnt(15)
+; MUBUF-NEXT: buffer_store_dword v7, v16, s[0:3], 0 offen offset:28
+; MUBUF-NEXT: s_waitcnt vmcnt(15)
+; MUBUF-NEXT: buffer_store_dword v8, v16, s[0:3], 0 offen offset:32
+; MUBUF-NEXT: s_waitcnt vmcnt(15)
+; MUBUF-NEXT: buffer_store_dword v9, v16, s[0:3], 0 offen offset:36
+; MUBUF-NEXT: s_waitcnt vmcnt(15)
+; MUBUF-NEXT: buffer_store_dword v10, v16, s[0:3], 0 offen offset:40
+; MUBUF-NEXT: s_waitcnt vmcnt(15)
+; MUBUF-NEXT: buffer_store_dword v11, v16, s[0:3], 0 offen offset:44
+; MUBUF-NEXT: s_waitcnt vmcnt(15)
+; MUBUF-NEXT: buffer_store_dword v12, v16, s[0:3], 0 offen offset:48
+; MUBUF-NEXT: s_waitcnt vmcnt(15)
+; MUBUF-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen offset:52
+; MUBUF-NEXT: s_waitcnt vmcnt(15)
+; MUBUF-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen offset:56
+; MUBUF-NEXT: s_waitcnt vmcnt(15)
+; MUBUF-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen offset:60
+; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; MUBUF-NEXT: s_endpgm
+;
+; FLATSCR-LABEL: kernel_caller_byval:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3
+; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
+; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
+; FLATSCR-NEXT: s_mov_b32 s33, 0
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:8
+; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:72
+; FLATSCR-NEXT: s_mov_b32 s33, 0
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:16
+; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:80
+; FLATSCR-NEXT: s_mov_b32 s33, 0
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:24
+; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:88
+; FLATSCR-NEXT: s_mov_b32 s33, 0
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:32
+; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:96
+; FLATSCR-NEXT: s_mov_b32 s33, 0
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:40
+; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:104
+; FLATSCR-NEXT: s_mov_b32 s33, 0
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:48
+; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:112
+; FLATSCR-NEXT: s_mov_b32 s33, 0
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:56
+; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:120
+; FLATSCR-NEXT: s_mov_b32 s33, 0
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:64
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:128
+; FLATSCR-NEXT: s_mov_b32 s33, 0
+; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s33 offset:8
+; FLATSCR-NEXT: s_mov_b32 s33, 0
+; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s33 offset:16
+; FLATSCR-NEXT: s_mov_b32 s33, 0
+; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s33 offset:24
+; FLATSCR-NEXT: s_mov_b32 s33, 0
+; FLATSCR-NEXT: scratch_load_dwordx2 v[6:7], off, s33 offset:32
+; FLATSCR-NEXT: s_mov_b32 s33, 0
+; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s33 offset:40
+; FLATSCR-NEXT: s_mov_b32 s33, 0
+; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s33 offset:48
+; FLATSCR-NEXT: s_mov_b32 s33, 0
+; FLATSCR-NEXT: scratch_load_dwordx2 v[12:13], off, s33 offset:56
+; FLATSCR-NEXT: s_mov_b32 s33, 0
+; FLATSCR-NEXT: scratch_load_dwordx2 v[14:15], off, s33 offset:64
+; FLATSCR-NEXT: s_movk_i32 s32, 0x50
+; FLATSCR-NEXT: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_byval@rel32@hi+12
+; FLATSCR-NEXT: s_waitcnt vmcnt(7)
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s32
+; FLATSCR-NEXT: s_waitcnt vmcnt(7)
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[2:3], s32 offset:8
+; FLATSCR-NEXT: s_waitcnt vmcnt(7)
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[4:5], s32 offset:16
+; FLATSCR-NEXT: s_waitcnt vmcnt(7)
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[6:7], s32 offset:24
+; FLATSCR-NEXT: s_waitcnt vmcnt(7)
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[8:9], s32 offset:32
+; FLATSCR-NEXT: s_waitcnt vmcnt(7)
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[10:11], s32 offset:40
+; FLATSCR-NEXT: s_waitcnt vmcnt(7)
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[12:13], s32 offset:48
+; FLATSCR-NEXT: s_waitcnt vmcnt(7)
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[14:15], s32 offset:56
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR-NEXT: s_endpgm
+ %alloca = alloca [16 x i32], align 4, addrspace(5)
+ %cast = bitcast [16 x i32] addrspace(5)* %alloca to i8 addrspace(5)*
+ call void @llvm.memset.p5i8.i32(i8 addrspace(5)* align 4 %cast, i8 0, i32 128, i1 false)
+ call void @external_void_func_byval([16 x i32] addrspace(5)* byval([16 x i32]) %alloca)
+ ret void
+}
+
+define void @func_caller_stack() {
+; MUBUF-LABEL: func_caller_stack:
+; MUBUF: ; %bb.0:
+; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1
+; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; MUBUF-NEXT: s_mov_b64 exec, s[4:5]
+; MUBUF-NEXT: v_writelane_b32 v40, s33, 2
+; MUBUF-NEXT: s_mov_b32 s33, s32
+; MUBUF-NEXT: s_addk_i32 s32, 0x400
+; MUBUF-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; MUBUF-NEXT: v_mov_b32_e32 v1, 9
+; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; MUBUF-NEXT: v_mov_b32_e32 v1, 10
+; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
+; MUBUF-NEXT: v_mov_b32_e32 v1, 11
+; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
+; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
+; MUBUF-NEXT: v_mov_b32_e32 v1, 12
+; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen
+; MUBUF-NEXT: s_getpc_b64 s[4:5]
+; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
+; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
+; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
+; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; MUBUF-NEXT: v_readlane_b32 s4, v40, 0
+; MUBUF-NEXT: v_readlane_b32 s5, v40, 1
+; MUBUF-NEXT: s_addk_i32 s32, 0xfc00
+; MUBUF-NEXT: v_readlane_b32 s33, v40, 2
+; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
+; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: s_setpc_b64 s[4:5]
+;
+; FLATSCR-LABEL: func_caller_stack:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1
+; FLATSCR-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
+; FLATSCR-NEXT: v_writelane_b32 v40, s33, 2
+; FLATSCR-NEXT: s_mov_b32 s33, s32
+; FLATSCR-NEXT: s_add_i32 s32, s32, 16
+; FLATSCR-NEXT: scratch_store_dword off, v0, s32
+; FLATSCR-NEXT: v_mov_b32_e32 v0, 9
+; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:4
+; FLATSCR-NEXT: v_mov_b32_e32 v0, 10
+; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:8
+; FLATSCR-NEXT: v_mov_b32_e32 v0, 11
+; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
+; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:12
+; FLATSCR-NEXT: v_mov_b32_e32 v0, 12
+; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
+; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:16
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR-NEXT: v_readlane_b32 s0, v40, 0
+; FLATSCR-NEXT: v_readlane_b32 s1, v40, 1
+; FLATSCR-NEXT: s_add_i32 s32, s32, -16
+; FLATSCR-NEXT: v_readlane_b32 s33, v40, 2
+; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: s_setpc_b64 s[0:1]
+ call void @external_void_func_v16i32_v16i32_v4i32(<16 x i32> undef, <16 x i32> undef, <4 x i32> <i32 9, i32 10, i32 11, i32 12>)
+ ret void
+}
+
+define void @func_caller_byval([16 x i32] addrspace(5)* %argptr) {
+; MUBUF-LABEL: func_caller_byval:
+; MUBUF: ; %bb.0:
+; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1
+; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; MUBUF-NEXT: s_mov_b64 exec, s[4:5]
+; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
+; MUBUF-NEXT: v_writelane_b32 v40, s33, 2
+; MUBUF-NEXT: s_mov_b32 s33, s32
+; MUBUF-NEXT: s_addk_i32 s32, 0x400
+; MUBUF-NEXT: v_lshrrev_b32_e64 v3, 6, s32
+; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
+; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT: s_getpc_b64 s[4:5]
+; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
+; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12
+; MUBUF-NEXT: s_waitcnt vmcnt(1)
+; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; MUBUF-NEXT: s_waitcnt vmcnt(1)
+; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4
+; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:8
+; MUBUF-NEXT: s_nop 0
+; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:12
+; MUBUF-NEXT: s_waitcnt vmcnt(1)
+; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen offset:8
+; MUBUF-NEXT: s_waitcnt vmcnt(1)
+; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:12
+; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:16
+; MUBUF-NEXT: s_nop 0
+; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:20
+; MUBUF-NEXT: s_waitcnt vmcnt(1)
+; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen offset:16
+; MUBUF-NEXT: s_waitcnt vmcnt(1)
+; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:20
+; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:24
+; MUBUF-NEXT: s_nop 0
+; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:28
+; MUBUF-NEXT: s_waitcnt vmcnt(1)
+; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen offset:24
+; MUBUF-NEXT: s_waitcnt vmcnt(1)
+; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:28
+; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:32
+; MUBUF-NEXT: s_nop 0
+; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:36
+; MUBUF-NEXT: s_waitcnt vmcnt(1)
+; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen offset:32
+; MUBUF-NEXT: s_waitcnt vmcnt(1)
+; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:36
+; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:40
+; MUBUF-NEXT: s_nop 0
+; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:44
+; MUBUF-NEXT: s_waitcnt vmcnt(1)
+; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen offset:40
+; MUBUF-NEXT: s_waitcnt vmcnt(1)
+; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:44
+; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:48
+; MUBUF-NEXT: s_nop 0
+; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:52
+; MUBUF-NEXT: s_waitcnt vmcnt(1)
+; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen offset:48
+; MUBUF-NEXT: s_waitcnt vmcnt(1)
+; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:52
+; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:56
+; MUBUF-NEXT: s_nop 0
+; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:60
+; MUBUF-NEXT: s_waitcnt vmcnt(1)
+; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen offset:56
+; MUBUF-NEXT: s_waitcnt vmcnt(1)
+; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:60
+; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; MUBUF-NEXT: v_readlane_b32 s4, v40, 0
+; MUBUF-NEXT: v_readlane_b32 s5, v40, 1
+; MUBUF-NEXT: s_addk_i32 s32, 0xfc00
+; MUBUF-NEXT: v_readlane_b32 s33, v40, 2
+; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
+; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: s_setpc_b64 s[4:5]
+;
+; FLATSCR-LABEL: func_caller_byval:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1
+; FLATSCR-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
+; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off
+; FLATSCR-NEXT: v_writelane_b32 v40, s33, 2
+; FLATSCR-NEXT: s_mov_b32 s33, s32
+; FLATSCR-NEXT: s_add_i32 s32, s32, 16
+; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
+; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_byval@rel32@hi+12
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s32
+; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off offset:8
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s32 offset:8
+; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off offset:16
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s32 offset:16
+; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off offset:24
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s32 offset:24
+; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off offset:32
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s32 offset:32
+; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off offset:40
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s32 offset:40
+; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off offset:48
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s32 offset:48
+; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], v0, off offset:56
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s32 offset:56
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR-NEXT: v_readlane_b32 s0, v40, 0
+; FLATSCR-NEXT: v_readlane_b32 s1, v40, 1
+; FLATSCR-NEXT: s_add_i32 s32, s32, -16
+; FLATSCR-NEXT: v_readlane_b32 s33, v40, 2
+; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: s_setpc_b64 s[0:1]
+ %cast = bitcast [16 x i32] addrspace(5)* %argptr to i8 addrspace(5)*
+ call void @external_void_func_byval([16 x i32] addrspace(5)* byval([16 x i32]) %argptr)
+ ret void
+}
+
+declare void @llvm.memset.p5i8.i32(i8 addrspace(5)* nocapture writeonly, i8, i32, i1 immarg) #1
+
+attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
+attributes #1 = { argmemonly nofree nounwind willreturn writeonly }
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1031 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE32 %s
+# RUN: llc -march=amdgcn -mcpu=gfx1031 -mattr=+wavefrontsize64 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE64 %s
+
+---
+name: wave_address_s
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: $sgpr32
+body: |
+ bb.0:
+ ; WAVE32-LABEL: name: wave_address_s
+ ; WAVE32: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 $sgpr32, 5, implicit-def $scc
+ ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_LSHR_B32_]]
+ ; WAVE64-LABEL: name: wave_address_s
+ ; WAVE64: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 $sgpr32, 6, implicit-def $scc
+ ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_LSHR_B32_]]
+ %0:sgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
+ S_ENDPGM 0, implicit %0
+...
+
+---
+name: wave_address_v
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: $sgpr32
+body: |
+ bb.0:
+ ; WAVE32-LABEL: name: wave_address_v
+ ; WAVE32: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 5, $sgpr32, implicit $exec
+ ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_LSHRREV_B32_e64_]]
+ ; WAVE64-LABEL: name: wave_address_v
+ ; WAVE64: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_LSHRREV_B32_e64_]]
+ %0:vgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
+ S_ENDPGM 0, implicit %0
+...
; GFX900-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32)
; GFX900-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
; GFX900-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<32 x s32>)
- ; GFX900-NEXT: [[COPY20:%[0-9]+]]:_(p5) = COPY $sp_reg
+ ; GFX900-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg
; GFX900-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GFX900-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY20]], [[C4]](s32)
+ ; GFX900-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C4]](s32)
; GFX900-NEXT: G_STORE [[UV31]](s32), [[PTR_ADD1]](p5) :: (store (s32) into stack, align 16, addrspace 5)
; GFX900-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX900-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX900-NEXT: $vgpr28 = COPY [[UV28]](s32)
; GFX900-NEXT: $vgpr29 = COPY [[UV29]](s32)
; GFX900-NEXT: $vgpr30 = COPY [[UV30]](s32)
- ; GFX900-NEXT: [[COPY21:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
- ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY21]](<4 x s32>)
+ ; GFX900-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
+ ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4)
; GFX900-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
; GFX908-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32)
; GFX908-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
; GFX908-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<32 x s32>)
- ; GFX908-NEXT: [[COPY20:%[0-9]+]]:_(p5) = COPY $sp_reg
+ ; GFX908-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg
; GFX908-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GFX908-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY20]], [[C4]](s32)
+ ; GFX908-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C4]](s32)
; GFX908-NEXT: G_STORE [[UV31]](s32), [[PTR_ADD1]](p5) :: (store (s32) into stack, align 16, addrspace 5)
; GFX908-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX908-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX908-NEXT: $vgpr28 = COPY [[UV28]](s32)
; GFX908-NEXT: $vgpr29 = COPY [[UV29]](s32)
; GFX908-NEXT: $vgpr30 = COPY [[UV30]](s32)
- ; GFX908-NEXT: [[COPY21:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
- ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY21]](<4 x s32>)
+ ; GFX908-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
+ ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4)
; GFX908-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
; GFX900-NEXT: [[COPY32:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; GFX900-NEXT: [[COPY33:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; GFX900-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<32 x s32>)
- ; GFX900-NEXT: [[COPY34:%[0-9]+]]:_(p5) = COPY $sgpr32
+ ; GFX900-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
; GFX900-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY34]], [[C1]](s32)
+ ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C1]](s32)
; GFX900-NEXT: G_STORE [[UV31]](s32), [[PTR_ADD]](p5) :: (store (s32) into stack, align 16, addrspace 5)
; GFX900-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX900-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX900-NEXT: $vgpr28 = COPY [[UV28]](s32)
; GFX900-NEXT: $vgpr29 = COPY [[UV29]](s32)
; GFX900-NEXT: $vgpr30 = COPY [[UV30]](s32)
- ; GFX900-NEXT: [[COPY35:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY35]](<4 x s32>)
+ ; GFX900-NEXT: [[COPY34:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY34]](<4 x s32>)
; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY26]](p4)
; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[COPY27]](p4)
; GFX900-NEXT: $sgpr8_sgpr9 = COPY [[COPY28]](p4)
; GFX900-NEXT: $vgpr31 = COPY [[COPY33]](s32)
; GFX900-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_void_func_v32i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
; GFX900-NEXT: ADJCALLSTACKDOWN 0, 4, implicit-def $scc
- ; GFX900-NEXT: [[COPY36:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY25]]
- ; GFX900-NEXT: S_SETPC_B64_return [[COPY36]]
+ ; GFX900-NEXT: [[COPY35:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY25]]
+ ; GFX900-NEXT: S_SETPC_B64_return [[COPY35]]
; GFX908-LABEL: name: test_func_call_external_void_func_v32i32
; GFX908: bb.1 (%ir-block.1):
; GFX908-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
; GFX908-NEXT: [[COPY32:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; GFX908-NEXT: [[COPY33:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; GFX908-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<32 x s32>)
- ; GFX908-NEXT: [[COPY34:%[0-9]+]]:_(p5) = COPY $sgpr32
+ ; GFX908-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
; GFX908-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY34]], [[C1]](s32)
+ ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C1]](s32)
; GFX908-NEXT: G_STORE [[UV31]](s32), [[PTR_ADD]](p5) :: (store (s32) into stack, align 16, addrspace 5)
; GFX908-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX908-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX908-NEXT: $vgpr28 = COPY [[UV28]](s32)
; GFX908-NEXT: $vgpr29 = COPY [[UV29]](s32)
; GFX908-NEXT: $vgpr30 = COPY [[UV30]](s32)
- ; GFX908-NEXT: [[COPY35:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY35]](<4 x s32>)
+ ; GFX908-NEXT: [[COPY34:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY34]](<4 x s32>)
; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY26]](p4)
; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[COPY27]](p4)
; GFX908-NEXT: $sgpr8_sgpr9 = COPY [[COPY28]](p4)
; GFX908-NEXT: $vgpr31 = COPY [[COPY33]](s32)
; GFX908-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_void_func_v32i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
; GFX908-NEXT: ADJCALLSTACKDOWN 0, 4, implicit-def $scc
- ; GFX908-NEXT: [[COPY36:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY25]]
- ; GFX908-NEXT: S_SETPC_B64_return [[COPY36]]
+ ; GFX908-NEXT: [[COPY35:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY25]]
+ ; GFX908-NEXT: S_SETPC_B64_return [[COPY35]]
call void @external_void_func_v32i32(<32 x i32> zeroinitializer)
ret void
}
; GCN-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY20]], [[C5]](s32)
; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
- ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(p5) = COPY $sp_reg
+ ; GCN-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg
; GCN-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY21]], [[C6]](s32)
+ ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C6]](s32)
; GCN-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
; GCN-NEXT: G_MEMCPY [[PTR_ADD2]](p5), [[FRAME_INDEX]](p5), [[C7]](s32), 0 :: (dereferenceable store (s64) into stack, align 4, addrspace 5), (dereferenceable load (s64) from %ir.in.val, align 4, addrspace 5)
; GCN-NEXT: $vgpr0 = COPY [[FRAME_INDEX1]](p5)
- ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
- ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY22]](<4 x s32>)
+ ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
+ ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY21]](<4 x s32>)
; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4)
; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY12]](p4)
; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD1]](p4)
; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<64 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16), [[UV6]](s16), [[UV7]](s16), [[UV8]](s16), [[UV9]](s16), [[UV10]](s16), [[UV11]](s16), [[UV12]](s16), [[UV13]](s16), [[UV14]](s16), [[UV15]](s16), [[UV16]](s16), [[UV17]](s16), [[UV18]](s16), [[UV19]](s16), [[UV20]](s16), [[UV21]](s16), [[UV22]](s16), [[UV23]](s16), [[UV24]](s16), [[UV25]](s16), [[UV26]](s16), [[UV27]](s16), [[UV28]](s16), [[UV29]](s16), [[UV30]](s16), [[UV31]](s16), [[UV32]](s16), [[UV33]](s16), [[UV34]](s16), [[UV35]](s16), [[UV36]](s16), [[UV37]](s16), [[UV38]](s16), [[UV39]](s16), [[UV40]](s16), [[UV41]](s16), [[UV42]](s16), [[UV43]](s16), [[UV44]](s16), [[UV45]](s16), [[UV46]](s16), [[UV47]](s16), [[UV48]](s16), [[UV49]](s16), [[UV50]](s16), [[UV51]](s16), [[UV52]](s16), [[UV53]](s16), [[UV54]](s16), [[UV55]](s16), [[UV56]](s16), [[UV57]](s16), [[UV58]](s16), [[UV59]](s16), [[UV60]](s16), [[UV61]](s16), [[UV62]](s16), [[DEF1]](s16)
; CHECK-NEXT: [[UV63:%[0-9]+]]:_(<2 x s16>), [[UV64:%[0-9]+]]:_(<2 x s16>), [[UV65:%[0-9]+]]:_(<2 x s16>), [[UV66:%[0-9]+]]:_(<2 x s16>), [[UV67:%[0-9]+]]:_(<2 x s16>), [[UV68:%[0-9]+]]:_(<2 x s16>), [[UV69:%[0-9]+]]:_(<2 x s16>), [[UV70:%[0-9]+]]:_(<2 x s16>), [[UV71:%[0-9]+]]:_(<2 x s16>), [[UV72:%[0-9]+]]:_(<2 x s16>), [[UV73:%[0-9]+]]:_(<2 x s16>), [[UV74:%[0-9]+]]:_(<2 x s16>), [[UV75:%[0-9]+]]:_(<2 x s16>), [[UV76:%[0-9]+]]:_(<2 x s16>), [[UV77:%[0-9]+]]:_(<2 x s16>), [[UV78:%[0-9]+]]:_(<2 x s16>), [[UV79:%[0-9]+]]:_(<2 x s16>), [[UV80:%[0-9]+]]:_(<2 x s16>), [[UV81:%[0-9]+]]:_(<2 x s16>), [[UV82:%[0-9]+]]:_(<2 x s16>), [[UV83:%[0-9]+]]:_(<2 x s16>), [[UV84:%[0-9]+]]:_(<2 x s16>), [[UV85:%[0-9]+]]:_(<2 x s16>), [[UV86:%[0-9]+]]:_(<2 x s16>), [[UV87:%[0-9]+]]:_(<2 x s16>), [[UV88:%[0-9]+]]:_(<2 x s16>), [[UV89:%[0-9]+]]:_(<2 x s16>), [[UV90:%[0-9]+]]:_(<2 x s16>), [[UV91:%[0-9]+]]:_(<2 x s16>), [[UV92:%[0-9]+]]:_(<2 x s16>), [[UV93:%[0-9]+]]:_(<2 x s16>), [[UV94:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<64 x s16>)
- ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(p5) = COPY $sp_reg
+ ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY20]], [[C3]](s32)
+ ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C3]](s32)
; CHECK-NEXT: G_STORE [[UV94]](<2 x s16>), [[PTR_ADD1]](p5) :: (store (<2 x s16>) into stack, align 16, addrspace 5)
; CHECK-NEXT: $vgpr0 = COPY [[UV63]](<2 x s16>)
; CHECK-NEXT: $vgpr1 = COPY [[UV64]](<2 x s16>)
; CHECK-NEXT: $vgpr28 = COPY [[UV91]](<2 x s16>)
; CHECK-NEXT: $vgpr29 = COPY [[UV92]](<2 x s16>)
; CHECK-NEXT: $vgpr30 = COPY [[UV93]](<2 x s16>)
- ; CHECK-NEXT: [[COPY21:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
- ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY21]](<4 x s32>)
+ ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<66 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16), [[UV6]](s16), [[UV7]](s16), [[UV8]](s16), [[UV9]](s16), [[UV10]](s16), [[UV11]](s16), [[UV12]](s16), [[UV13]](s16), [[UV14]](s16), [[UV15]](s16), [[UV16]](s16), [[UV17]](s16), [[UV18]](s16), [[UV19]](s16), [[UV20]](s16), [[UV21]](s16), [[UV22]](s16), [[UV23]](s16), [[UV24]](s16), [[UV25]](s16), [[UV26]](s16), [[UV27]](s16), [[UV28]](s16), [[UV29]](s16), [[UV30]](s16), [[UV31]](s16), [[UV32]](s16), [[UV33]](s16), [[UV34]](s16), [[UV35]](s16), [[UV36]](s16), [[UV37]](s16), [[UV38]](s16), [[UV39]](s16), [[UV40]](s16), [[UV41]](s16), [[UV42]](s16), [[UV43]](s16), [[UV44]](s16), [[UV45]](s16), [[UV46]](s16), [[UV47]](s16), [[UV48]](s16), [[UV49]](s16), [[UV50]](s16), [[UV51]](s16), [[UV52]](s16), [[UV53]](s16), [[UV54]](s16), [[UV55]](s16), [[UV56]](s16), [[UV57]](s16), [[UV58]](s16), [[UV59]](s16), [[UV60]](s16), [[UV61]](s16), [[UV62]](s16), [[UV63]](s16), [[UV64]](s16), [[DEF1]](s16)
; CHECK-NEXT: [[UV65:%[0-9]+]]:_(<2 x s16>), [[UV66:%[0-9]+]]:_(<2 x s16>), [[UV67:%[0-9]+]]:_(<2 x s16>), [[UV68:%[0-9]+]]:_(<2 x s16>), [[UV69:%[0-9]+]]:_(<2 x s16>), [[UV70:%[0-9]+]]:_(<2 x s16>), [[UV71:%[0-9]+]]:_(<2 x s16>), [[UV72:%[0-9]+]]:_(<2 x s16>), [[UV73:%[0-9]+]]:_(<2 x s16>), [[UV74:%[0-9]+]]:_(<2 x s16>), [[UV75:%[0-9]+]]:_(<2 x s16>), [[UV76:%[0-9]+]]:_(<2 x s16>), [[UV77:%[0-9]+]]:_(<2 x s16>), [[UV78:%[0-9]+]]:_(<2 x s16>), [[UV79:%[0-9]+]]:_(<2 x s16>), [[UV80:%[0-9]+]]:_(<2 x s16>), [[UV81:%[0-9]+]]:_(<2 x s16>), [[UV82:%[0-9]+]]:_(<2 x s16>), [[UV83:%[0-9]+]]:_(<2 x s16>), [[UV84:%[0-9]+]]:_(<2 x s16>), [[UV85:%[0-9]+]]:_(<2 x s16>), [[UV86:%[0-9]+]]:_(<2 x s16>), [[UV87:%[0-9]+]]:_(<2 x s16>), [[UV88:%[0-9]+]]:_(<2 x s16>), [[UV89:%[0-9]+]]:_(<2 x s16>), [[UV90:%[0-9]+]]:_(<2 x s16>), [[UV91:%[0-9]+]]:_(<2 x s16>), [[UV92:%[0-9]+]]:_(<2 x s16>), [[UV93:%[0-9]+]]:_(<2 x s16>), [[UV94:%[0-9]+]]:_(<2 x s16>), [[UV95:%[0-9]+]]:_(<2 x s16>), [[UV96:%[0-9]+]]:_(<2 x s16>), [[UV97:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<66 x s16>)
- ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(p5) = COPY $sp_reg
+ ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY20]], [[C3]](s32)
+ ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C3]](s32)
; CHECK-NEXT: G_STORE [[UV96]](<2 x s16>), [[PTR_ADD1]](p5) :: (store (<2 x s16>) into stack, align 16, addrspace 5)
; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
- ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY20]], [[C4]](s32)
+ ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C4]](s32)
; CHECK-NEXT: G_STORE [[UV97]](<2 x s16>), [[PTR_ADD2]](p5) :: (store (<2 x s16>) into stack + 4, addrspace 5)
; CHECK-NEXT: $vgpr0 = COPY [[UV65]](<2 x s16>)
; CHECK-NEXT: $vgpr1 = COPY [[UV66]](<2 x s16>)
; CHECK-NEXT: $vgpr28 = COPY [[UV93]](<2 x s16>)
; CHECK-NEXT: $vgpr29 = COPY [[UV94]](<2 x s16>)
; CHECK-NEXT: $vgpr30 = COPY [[UV95]](<2 x s16>)
- ; CHECK-NEXT: [[COPY21:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
- ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY21]](<4 x s32>)
+ ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32)
; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>), [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>), [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>), [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>), [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>), [[UV16:%[0-9]+]]:_(<2 x s16>), [[UV17:%[0-9]+]]:_(<2 x s16>), [[UV18:%[0-9]+]]:_(<2 x s16>), [[UV19:%[0-9]+]]:_(<2 x s16>), [[UV20:%[0-9]+]]:_(<2 x s16>), [[UV21:%[0-9]+]]:_(<2 x s16>), [[UV22:%[0-9]+]]:_(<2 x s16>), [[UV23:%[0-9]+]]:_(<2 x s16>), [[UV24:%[0-9]+]]:_(<2 x s16>), [[UV25:%[0-9]+]]:_(<2 x s16>), [[UV26:%[0-9]+]]:_(<2 x s16>), [[UV27:%[0-9]+]]:_(<2 x s16>), [[UV28:%[0-9]+]]:_(<2 x s16>), [[UV29:%[0-9]+]]:_(<2 x s16>), [[UV30:%[0-9]+]]:_(<2 x s16>), [[UV31:%[0-9]+]]:_(<2 x s16>), [[UV32:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<66 x s16>)
- ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(p5) = COPY $sp_reg
+ ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY20]], [[C3]](s32)
+ ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C3]](s32)
; CHECK-NEXT: G_STORE [[UV31]](<2 x s16>), [[PTR_ADD1]](p5) :: (store (<2 x s16>) into stack, align 16, addrspace 5)
; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
- ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY20]], [[C4]](s32)
+ ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C4]](s32)
; CHECK-NEXT: G_STORE [[UV32]](<2 x s16>), [[PTR_ADD2]](p5) :: (store (<2 x s16>) into stack + 4, addrspace 5)
; CHECK-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
; CHECK-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
; CHECK-NEXT: $vgpr28 = COPY [[UV28]](<2 x s16>)
; CHECK-NEXT: $vgpr29 = COPY [[UV29]](<2 x s16>)
; CHECK-NEXT: $vgpr30 = COPY [[UV30]](<2 x s16>)
- ; CHECK-NEXT: [[COPY21:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
- ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY21]](<4 x s32>)
+ ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32)
; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<32 x s32>)
- ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(p5) = COPY $sp_reg
+ ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY20]], [[C3]](s32)
+ ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C3]](s32)
; CHECK-NEXT: G_STORE [[UV31]](s32), [[PTR_ADD1]](p5) :: (store (s32) into stack, align 16, addrspace 5)
; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32)
; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32)
; CHECK-NEXT: $vgpr28 = COPY [[UV28]](s32)
; CHECK-NEXT: $vgpr29 = COPY [[UV29]](s32)
; CHECK-NEXT: $vgpr30 = COPY [[UV30]](s32)
- ; CHECK-NEXT: [[COPY21:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
- ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY21]](<4 x s32>)
+ ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32)
; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<32 x s32>)
- ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(p5) = COPY $sp_reg
+ ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY20]], [[C3]](s32)
+ ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C3]](s32)
; CHECK-NEXT: G_STORE [[UV31]](s32), [[PTR_ADD1]](p5) :: (store (s32) into stack, align 16, addrspace 5)
; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
- ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY20]], [[C4]](s32)
+ ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C4]](s32)
; CHECK-NEXT: G_STORE [[LOAD2]](s32), [[PTR_ADD2]](p5) :: (store (s32) into stack + 4, addrspace 5)
; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32)
; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32)
; CHECK-NEXT: $vgpr28 = COPY [[UV28]](s32)
; CHECK-NEXT: $vgpr29 = COPY [[UV29]](s32)
; CHECK-NEXT: $vgpr30 = COPY [[UV30]](s32)
- ; CHECK-NEXT: [[COPY21:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
- ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY21]](<4 x s32>)
+ ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY20]], [[C2]](s32)
; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<32 x s32>)
- ; CHECK-NEXT: [[COPY21:%[0-9]+]]:_(p5) = COPY $sp_reg
+ ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY21]], [[C3]](s32)
+ ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C3]](s32)
; CHECK-NEXT: G_STORE [[UV31]](s32), [[PTR_ADD1]](p5) :: (store (s32) into stack, align 16, addrspace 5)
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD2]](s8)
; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
- ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY21]], [[C4]](s32)
+ ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C4]](s32)
; CHECK-NEXT: G_STORE [[ANYEXT]](s16), [[PTR_ADD2]](p5) :: (store (s16) into stack + 4, align 4, addrspace 5)
- ; CHECK-NEXT: [[COPY22:%[0-9]+]]:_(s16) = COPY [[ANYEXT]](s16)
+ ; CHECK-NEXT: [[COPY21:%[0-9]+]]:_(s16) = COPY [[ANYEXT]](s16)
; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
- ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY21]], [[C5]](s32)
- ; CHECK-NEXT: G_STORE [[COPY22]](s16), [[PTR_ADD3]](p5) :: (store (s16) into stack + 8, align 8, addrspace 5)
+ ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C5]](s32)
+ ; CHECK-NEXT: G_STORE [[COPY21]](s16), [[PTR_ADD3]](p5) :: (store (s16) into stack + 8, align 8, addrspace 5)
; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
- ; CHECK-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY21]], [[C6]](s32)
+ ; CHECK-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C6]](s32)
; CHECK-NEXT: G_STORE [[LOAD3]](s16), [[PTR_ADD4]](p5) :: (store (s16) into stack + 12, align 4, addrspace 5)
; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32)
; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32)
; CHECK-NEXT: $vgpr28 = COPY [[UV28]](s32)
; CHECK-NEXT: $vgpr29 = COPY [[UV29]](s32)
; CHECK-NEXT: $vgpr30 = COPY [[UV30]](s32)
- ; CHECK-NEXT: [[COPY23:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
- ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY23]](<4 x s32>)
+ ; CHECK-NEXT: [[COPY22:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY22]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY12]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY20]], [[C2]](s32)
; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<32 x s32>)
- ; CHECK-NEXT: [[COPY21:%[0-9]+]]:_(p5) = COPY $sp_reg
+ ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY21]], [[C3]](s32)
+ ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C3]](s32)
; CHECK-NEXT: G_STORE [[UV31]](s32), [[PTR_ADD1]](p5) :: (store (s32) into stack, align 16, addrspace 5)
; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
- ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY21]], [[C4]](s32)
+ ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C4]](s32)
; CHECK-NEXT: G_STORE [[LOAD2]](p3), [[PTR_ADD2]](p5) :: (store (p3) into stack + 4, addrspace 5)
; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
- ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY21]], [[C5]](s32)
+ ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C5]](s32)
; CHECK-NEXT: G_STORE [[LOAD3]](p5), [[PTR_ADD3]](p5) :: (store (p5) into stack + 8, align 8, addrspace 5)
; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32)
; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32)
; CHECK-NEXT: $vgpr28 = COPY [[UV28]](s32)
; CHECK-NEXT: $vgpr29 = COPY [[UV29]](s32)
; CHECK-NEXT: $vgpr30 = COPY [[UV30]](s32)
- ; CHECK-NEXT: [[COPY22:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
- ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY22]](<4 x s32>)
+ ; CHECK-NEXT: [[COPY21:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY21]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY12]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C5]](s32)
; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
- ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(p5) = COPY $sp_reg
+ ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg
; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY20]], [[C6]](s32)
+ ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C6]](s32)
; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
; CHECK-NEXT: G_MEMCPY [[PTR_ADD2]](p5), [[FRAME_INDEX]](p5), [[C7]](s32), 0 :: (dereferenceable store (s64) into stack, align 4, addrspace 5), (dereferenceable load (s64) from %ir.val, align 4, addrspace 5)
- ; CHECK-NEXT: [[COPY21:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
- ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY21]](<4 x s32>)
+ ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD1]](p4)
; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]]
; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(p5) = COPY $sgpr32
+ ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY19]], [[C1]](s32)
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C1]](s32)
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
; CHECK-NEXT: G_MEMCPY [[PTR_ADD]](p5), [[COPY8]](p5), [[C2]](s32), 0 :: (dereferenceable store (s96) into stack, align 4, addrspace 5), (dereferenceable load (s96) from %ir.incoming0, align 4, addrspace 5)
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
- ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY19]], [[C3]](s32)
+ ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C3]](s32)
; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; CHECK-NEXT: G_MEMCPY [[PTR_ADD1]](p5), [[COPY9]](p5), [[C4]](s32), 0 :: (dereferenceable store (s8) into stack + 32, align 32, addrspace 5), (dereferenceable load (s8) from %ir.incoming1, align 32, addrspace 5)
; CHECK-NEXT: $vgpr0 = COPY [[C]](s32)
- ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
+ ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY12]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[COPY13]](p4)
; CHECK-NEXT: $vgpr31 = COPY [[COPY18]](s32)
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @void_func_byval_a3i32_byval_i8_align32, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 36, implicit-def $scc
- ; CHECK-NEXT: [[COPY21:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY10]]
- ; CHECK-NEXT: S_SETPC_B64_return [[COPY21]]
+ ; CHECK-NEXT: [[COPY20:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY10]]
+ ; CHECK-NEXT: S_SETPC_B64_return [[COPY20]]
call void @void_func_byval_a3i32_byval_i8_align32([3 x i32] addrspace(5)* byval([3 x i32]) %incoming0, i8 addrspace(5)* align 32 %incoming1, i32 999)
ret void
}
; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]]
; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(p5) = COPY $sgpr32
+ ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY18]], [[C]](s32)
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C]](s32)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
; CHECK-NEXT: G_MEMCPY [[PTR_ADD]](p5), [[COPY8]](p5), [[C1]](s32), 0 :: (dereferenceable store (s256) into stack, align 4, addrspace 5), (dereferenceable load (s256) from %ir.incoming_high_align, align 256, addrspace 5)
- ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
+ ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[COPY12]](p4)
; CHECK-NEXT: $vgpr31 = COPY [[COPY17]](s32)
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @void_func_byval_a4i64_align4, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 32, implicit-def $scc
- ; CHECK-NEXT: [[COPY20:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY9]]
- ; CHECK-NEXT: S_SETPC_B64_return [[COPY20]]
+ ; CHECK-NEXT: [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY9]]
+ ; CHECK-NEXT: S_SETPC_B64_return [[COPY19]]
call void @void_func_byval_a4i64_align4([4 x i64] addrspace(5)* byval([4 x i64]) align 4 %incoming_high_align)
ret void
}
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32)
; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<32 x s32>)
- ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(p5) = COPY $sp_reg
+ ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg
; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY20]], [[C4]](s32)
+ ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C4]](s32)
; CHECK-NEXT: G_STORE [[UV31]](s32), [[PTR_ADD2]](p5) :: (store (s32) into stack, align 16, addrspace 5)
; CHECK-NEXT: [[UV32:%[0-9]+]]:_(s32), [[UV33:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](s64)
; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
- ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY20]], [[C5]](s32)
+ ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C5]](s32)
; CHECK-NEXT: G_STORE [[UV32]](s32), [[PTR_ADD3]](p5) :: (store (s32) into stack + 4, addrspace 5)
; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
- ; CHECK-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY20]], [[C6]](s32)
+ ; CHECK-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C6]](s32)
; CHECK-NEXT: G_STORE [[UV33]](s32), [[PTR_ADD4]](p5) :: (store (s32) into stack + 8, align 8, addrspace 5)
; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32)
; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32)
; CHECK-NEXT: $vgpr28 = COPY [[UV28]](s32)
; CHECK-NEXT: $vgpr29 = COPY [[UV29]](s32)
; CHECK-NEXT: $vgpr30 = COPY [[UV30]](s32)
- ; CHECK-NEXT: [[COPY21:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
- ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY21]](<4 x s32>)
+ ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD1]](p4)
; CHECK-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR8]](<3 x s32>)
; CHECK-NEXT: [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR9]](<3 x s32>)
; CHECK-NEXT: [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32), [[UV32:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR10]](<3 x s32>)
- ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(p5) = COPY $sgpr32
+ ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
; CHECK-NEXT: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C16]](s32)
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C16]](s32)
; CHECK-NEXT: G_STORE [[UV31]](s32), [[PTR_ADD]](p5) :: (store (s32) into stack, align 16, addrspace 5)
; CHECK-NEXT: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
- ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C17]](s32)
+ ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C17]](s32)
; CHECK-NEXT: G_STORE [[UV32]](s32), [[PTR_ADD1]](p5) :: (store (s32) into stack + 4, addrspace 5)
; CHECK-NEXT: [[UV33:%[0-9]+]]:_(s32), [[UV34:%[0-9]+]]:_(s32), [[UV35:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR11]](<3 x s32>)
; CHECK-NEXT: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
- ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C18]](s32)
+ ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C18]](s32)
; CHECK-NEXT: G_STORE [[UV33]](s32), [[PTR_ADD2]](p5) :: (store (s32) into stack + 8, align 8, addrspace 5)
; CHECK-NEXT: [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
- ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C19]](s32)
+ ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C19]](s32)
; CHECK-NEXT: G_STORE [[UV34]](s32), [[PTR_ADD3]](p5) :: (store (s32) into stack + 12, addrspace 5)
; CHECK-NEXT: [[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; CHECK-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C20]](s32)
+ ; CHECK-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C20]](s32)
; CHECK-NEXT: G_STORE [[UV35]](s32), [[PTR_ADD4]](p5) :: (store (s32) into stack + 16, align 16, addrspace 5)
; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32)
; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32)
; CHECK-NEXT: $vgpr28 = COPY [[UV28]](s32)
; CHECK-NEXT: $vgpr29 = COPY [[UV29]](s32)
; CHECK-NEXT: $vgpr30 = COPY [[UV30]](s32)
- ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
+ ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY17]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY10]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[COPY11]](p4)
; CHECK-NEXT: $vgpr31 = COPY [[COPY16]](s32)
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_void_func_12xv3i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 20, implicit-def $scc
- ; CHECK-NEXT: [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
- ; CHECK-NEXT: S_SETPC_B64_return [[COPY19]]
+ ; CHECK-NEXT: [[COPY18:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
+ ; CHECK-NEXT: S_SETPC_B64_return [[COPY18]]
entry:
call void @external_void_func_12xv3i32(
<3 x i32> <i32 0, i32 0, i32 0>,
; CHECK-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR8]](<3 x s32>)
; CHECK-NEXT: [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR9]](<3 x s32>)
; CHECK-NEXT: [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32), [[UV32:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR10]](<3 x s32>)
- ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(p5) = COPY $sgpr32
+ ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
; CHECK-NEXT: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C16]](s32)
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C16]](s32)
; CHECK-NEXT: G_STORE [[UV31]](s32), [[PTR_ADD]](p5) :: (store (s32) into stack, align 16, addrspace 5)
; CHECK-NEXT: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
- ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C17]](s32)
+ ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C17]](s32)
; CHECK-NEXT: G_STORE [[UV32]](s32), [[PTR_ADD1]](p5) :: (store (s32) into stack + 4, addrspace 5)
; CHECK-NEXT: [[UV33:%[0-9]+]]:_(s32), [[UV34:%[0-9]+]]:_(s32), [[UV35:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR11]](<3 x s32>)
; CHECK-NEXT: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
- ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C18]](s32)
+ ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C18]](s32)
; CHECK-NEXT: G_STORE [[UV33]](s32), [[PTR_ADD2]](p5) :: (store (s32) into stack + 8, align 8, addrspace 5)
; CHECK-NEXT: [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
- ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C19]](s32)
+ ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C19]](s32)
; CHECK-NEXT: G_STORE [[UV34]](s32), [[PTR_ADD3]](p5) :: (store (s32) into stack + 12, addrspace 5)
; CHECK-NEXT: [[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; CHECK-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C20]](s32)
+ ; CHECK-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C20]](s32)
; CHECK-NEXT: G_STORE [[UV35]](s32), [[PTR_ADD4]](p5) :: (store (s32) into stack + 16, align 16, addrspace 5)
; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32)
; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32)
; CHECK-NEXT: $vgpr28 = COPY [[UV28]](s32)
; CHECK-NEXT: $vgpr29 = COPY [[UV29]](s32)
; CHECK-NEXT: $vgpr30 = COPY [[UV30]](s32)
- ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
+ ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY17]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY10]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[COPY11]](p4)
; CHECK-NEXT: $vgpr31 = COPY [[COPY16]](s32)
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_void_func_12xv3f32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 20, implicit-def $scc
- ; CHECK-NEXT: [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
- ; CHECK-NEXT: S_SETPC_B64_return [[COPY19]]
+ ; CHECK-NEXT: [[COPY18:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
+ ; CHECK-NEXT: S_SETPC_B64_return [[COPY18]]
entry:
call void @external_void_func_12xv3f32(
<3 x float> <float 0.0, float 0.0, float 0.0>,
; CHECK-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR4]](<5 x s32>)
; CHECK-NEXT: [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR5]](<5 x s32>)
; CHECK-NEXT: [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32), [[UV32:%[0-9]+]]:_(s32), [[UV33:%[0-9]+]]:_(s32), [[UV34:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR6]](<5 x s32>)
- ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(p5) = COPY $sgpr32
+ ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
; CHECK-NEXT: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C16]](s32)
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C16]](s32)
; CHECK-NEXT: G_STORE [[UV31]](s32), [[PTR_ADD]](p5) :: (store (s32) into stack, align 16, addrspace 5)
; CHECK-NEXT: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
- ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C17]](s32)
+ ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C17]](s32)
; CHECK-NEXT: G_STORE [[UV32]](s32), [[PTR_ADD1]](p5) :: (store (s32) into stack + 4, addrspace 5)
; CHECK-NEXT: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
- ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C18]](s32)
+ ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C18]](s32)
; CHECK-NEXT: G_STORE [[UV33]](s32), [[PTR_ADD2]](p5) :: (store (s32) into stack + 8, align 8, addrspace 5)
; CHECK-NEXT: [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
- ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C19]](s32)
+ ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C19]](s32)
; CHECK-NEXT: G_STORE [[UV34]](s32), [[PTR_ADD3]](p5) :: (store (s32) into stack + 12, addrspace 5)
; CHECK-NEXT: [[UV35:%[0-9]+]]:_(s32), [[UV36:%[0-9]+]]:_(s32), [[UV37:%[0-9]+]]:_(s32), [[UV38:%[0-9]+]]:_(s32), [[UV39:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR7]](<5 x s32>)
; CHECK-NEXT: [[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; CHECK-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C20]](s32)
+ ; CHECK-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C20]](s32)
; CHECK-NEXT: G_STORE [[UV35]](s32), [[PTR_ADD4]](p5) :: (store (s32) into stack + 16, align 16, addrspace 5)
; CHECK-NEXT: [[C21:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
- ; CHECK-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C21]](s32)
+ ; CHECK-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C21]](s32)
; CHECK-NEXT: G_STORE [[UV36]](s32), [[PTR_ADD5]](p5) :: (store (s32) into stack + 20, addrspace 5)
; CHECK-NEXT: [[C22:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
- ; CHECK-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C22]](s32)
+ ; CHECK-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C22]](s32)
; CHECK-NEXT: G_STORE [[UV37]](s32), [[PTR_ADD6]](p5) :: (store (s32) into stack + 24, align 8, addrspace 5)
; CHECK-NEXT: [[C23:%[0-9]+]]:_(s32) = G_CONSTANT i32 28
- ; CHECK-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C23]](s32)
+ ; CHECK-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C23]](s32)
; CHECK-NEXT: G_STORE [[UV38]](s32), [[PTR_ADD7]](p5) :: (store (s32) into stack + 28, addrspace 5)
; CHECK-NEXT: [[C24:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
- ; CHECK-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C24]](s32)
+ ; CHECK-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C24]](s32)
; CHECK-NEXT: G_STORE [[UV39]](s32), [[PTR_ADD8]](p5) :: (store (s32) into stack + 32, align 16, addrspace 5)
; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32)
; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32)
; CHECK-NEXT: $vgpr28 = COPY [[UV28]](s32)
; CHECK-NEXT: $vgpr29 = COPY [[UV29]](s32)
; CHECK-NEXT: $vgpr30 = COPY [[UV30]](s32)
- ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
+ ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY17]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY10]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[COPY11]](p4)
; CHECK-NEXT: $vgpr31 = COPY [[COPY16]](s32)
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_void_func_8xv5i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 36, implicit-def $scc
- ; CHECK-NEXT: [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
- ; CHECK-NEXT: S_SETPC_B64_return [[COPY19]]
+ ; CHECK-NEXT: [[COPY18:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
+ ; CHECK-NEXT: S_SETPC_B64_return [[COPY18]]
entry:
call void @external_void_func_8xv5i32(
<5 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0>,
; CHECK-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR4]](<5 x s32>)
; CHECK-NEXT: [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR5]](<5 x s32>)
; CHECK-NEXT: [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32), [[UV32:%[0-9]+]]:_(s32), [[UV33:%[0-9]+]]:_(s32), [[UV34:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR6]](<5 x s32>)
- ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(p5) = COPY $sgpr32
+ ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
; CHECK-NEXT: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C16]](s32)
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C16]](s32)
; CHECK-NEXT: G_STORE [[UV31]](s32), [[PTR_ADD]](p5) :: (store (s32) into stack, align 16, addrspace 5)
; CHECK-NEXT: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
- ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C17]](s32)
+ ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C17]](s32)
; CHECK-NEXT: G_STORE [[UV32]](s32), [[PTR_ADD1]](p5) :: (store (s32) into stack + 4, addrspace 5)
; CHECK-NEXT: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
- ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C18]](s32)
+ ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C18]](s32)
; CHECK-NEXT: G_STORE [[UV33]](s32), [[PTR_ADD2]](p5) :: (store (s32) into stack + 8, align 8, addrspace 5)
; CHECK-NEXT: [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
- ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C19]](s32)
+ ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C19]](s32)
; CHECK-NEXT: G_STORE [[UV34]](s32), [[PTR_ADD3]](p5) :: (store (s32) into stack + 12, addrspace 5)
; CHECK-NEXT: [[UV35:%[0-9]+]]:_(s32), [[UV36:%[0-9]+]]:_(s32), [[UV37:%[0-9]+]]:_(s32), [[UV38:%[0-9]+]]:_(s32), [[UV39:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR7]](<5 x s32>)
; CHECK-NEXT: [[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; CHECK-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C20]](s32)
+ ; CHECK-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C20]](s32)
; CHECK-NEXT: G_STORE [[UV35]](s32), [[PTR_ADD4]](p5) :: (store (s32) into stack + 16, align 16, addrspace 5)
; CHECK-NEXT: [[C21:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
- ; CHECK-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C21]](s32)
+ ; CHECK-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C21]](s32)
; CHECK-NEXT: G_STORE [[UV36]](s32), [[PTR_ADD5]](p5) :: (store (s32) into stack + 20, addrspace 5)
; CHECK-NEXT: [[C22:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
- ; CHECK-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C22]](s32)
+ ; CHECK-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C22]](s32)
; CHECK-NEXT: G_STORE [[UV37]](s32), [[PTR_ADD6]](p5) :: (store (s32) into stack + 24, align 8, addrspace 5)
; CHECK-NEXT: [[C23:%[0-9]+]]:_(s32) = G_CONSTANT i32 28
- ; CHECK-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C23]](s32)
+ ; CHECK-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C23]](s32)
; CHECK-NEXT: G_STORE [[UV38]](s32), [[PTR_ADD7]](p5) :: (store (s32) into stack + 28, addrspace 5)
; CHECK-NEXT: [[C24:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
- ; CHECK-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C24]](s32)
+ ; CHECK-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C24]](s32)
; CHECK-NEXT: G_STORE [[UV39]](s32), [[PTR_ADD8]](p5) :: (store (s32) into stack + 32, align 16, addrspace 5)
; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32)
; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32)
; CHECK-NEXT: $vgpr28 = COPY [[UV28]](s32)
; CHECK-NEXT: $vgpr29 = COPY [[UV29]](s32)
; CHECK-NEXT: $vgpr30 = COPY [[UV30]](s32)
- ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
+ ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY17]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY10]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[COPY11]](p4)
; CHECK-NEXT: $vgpr31 = COPY [[COPY16]](s32)
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_void_func_8xv5f32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 36, implicit-def $scc
- ; CHECK-NEXT: [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
- ; CHECK-NEXT: S_SETPC_B64_return [[COPY19]]
+ ; CHECK-NEXT: [[COPY18:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
+ ; CHECK-NEXT: S_SETPC_B64_return [[COPY18]]
entry:
call void @external_void_func_8xv5f32(
<5 x float> <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>,
; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @i32_fastcc_i32_byval_i32
- ; GCN-NEXT: [[COPY4:%[0-9]+]]:_(p5) = COPY $sgpr32
+ ; GCN-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY4]], [[C]](s32)
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C]](s32)
; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
; GCN-NEXT: G_MEMCPY [[PTR_ADD]](p5), [[COPY1]](p5), [[C1]](s32), 0 :: (dereferenceable store (s32) into stack, addrspace 5), (dereferenceable load (s32) from %ir.b.byval, addrspace 5)
; GCN-NEXT: $vgpr0 = COPY [[COPY]](s32)
- ; GCN-NEXT: [[COPY5:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY5]](<4 x s32>)
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY4]](<4 x s32>)
; GCN-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @i32_fastcc_i32_byval_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
- ; GCN-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: ADJCALLSTACKDOWN 0, 4, implicit-def $scc
- ; GCN-NEXT: $vgpr0 = COPY [[COPY6]](s32)
- ; GCN-NEXT: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
- ; GCN-NEXT: S_SETPC_B64_return [[COPY7]], implicit $vgpr0
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY5]](s32)
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+ ; GCN-NEXT: S_SETPC_B64_return [[COPY6]], implicit $vgpr0
entry:
%ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32 addrspace(5)* byval(i32) %b.byval)
ret i32 %ret
; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @i32_fastcc_i32_i32_a32i32
- ; GCN-NEXT: [[COPY3:%[0-9]+]]:_(p5) = COPY $sgpr32
+ ; GCN-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY3]], [[C1]](s32)
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C1]](s32)
; GCN-NEXT: G_STORE [[C]](s32), [[PTR_ADD]](p5) :: (store (s32) into stack, align 16, addrspace 5)
; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
- ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY3]], [[C2]](s32)
+ ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C2]](s32)
; GCN-NEXT: G_STORE [[C]](s32), [[PTR_ADD1]](p5) :: (store (s32) into stack + 4, addrspace 5)
; GCN-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
- ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY3]], [[C3]](s32)
+ ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C3]](s32)
; GCN-NEXT: G_STORE [[C]](s32), [[PTR_ADD2]](p5) :: (store (s32) into stack + 8, align 8, addrspace 5)
; GCN-NEXT: $vgpr0 = COPY [[COPY]](s32)
; GCN-NEXT: $vgpr1 = COPY [[COPY1]](s32)
; GCN-NEXT: $vgpr28 = COPY [[C]](s32)
; GCN-NEXT: $vgpr29 = COPY [[C]](s32)
; GCN-NEXT: $vgpr30 = COPY [[C]](s32)
- ; GCN-NEXT: [[COPY4:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY4]](<4 x s32>)
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY3]](<4 x s32>)
; GCN-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @i32_fastcc_i32_i32_a32i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
- ; GCN-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: ADJCALLSTACKDOWN 0, 12, implicit-def $scc
- ; GCN-NEXT: $vgpr0 = COPY [[COPY5]](s32)
- ; GCN-NEXT: [[COPY6:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
- ; GCN-NEXT: S_SETPC_B64_return [[COPY6]], implicit $vgpr0
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY4]](s32)
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+ ; GCN-NEXT: S_SETPC_B64_return [[COPY5]], implicit $vgpr0
entry:
%ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
ret i32 %ret
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -o - %s | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -o - %s | FileCheck %s
+
+# TODO: We could use scalar
+---
+name: amdgpu_wave_address
+legalized: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: amdgpu_wave_address
+ ; CHECK: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:vgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[AMDGPU_WAVE_ADDRESS]](p5)
+ %0:_(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
+ S_ENDPGM 0, implicit %0
+...
+
+# TODO: Should infer v here
+---
+name: amdgpu_wave_address_v
+legalized: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: amdgpu_wave_address_v
+ ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:vgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
+ ; CHECK-NEXT: G_STORE [[AMDGPU_WAVE_ADDRESS]](p5), [[COPY]](p1) :: (store (p5), addrspace 1)
+ %0:_(p1) = G_IMPLICIT_DEF
+ %1:_(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
+ G_STORE %1, %0 :: (store (p5), addrspace 1)
+...