using namespace llvm;
namespace {
+static constexpr StringLiteral ImplicitAttrNames[] = {
+ // X ids unnecessarily propagated to kernels.
+ "amdgpu-work-item-id-x", "amdgpu-work-item-id-y",
+ "amdgpu-work-item-id-z", "amdgpu-work-group-id-x",
+ "amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
+ "amdgpu-dispatch-ptr", "amdgpu-dispatch-id",
+ "amdgpu-implicitarg-ptr"};
class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
bool &NeedQueuePtr) {
- // X ids unnecessarily propagated to kernels.
- static constexpr StringLiteral AttrNames[] = {
- "amdgpu-work-item-id-x", "amdgpu-work-item-id-y",
- "amdgpu-work-item-id-z", "amdgpu-work-group-id-x",
- "amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
- "amdgpu-dispatch-ptr", "amdgpu-dispatch-id",
- "amdgpu-implicitarg-ptr"};
-
if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
NeedQueuePtr = true;
- for (StringRef AttrName : AttrNames)
+ for (StringRef AttrName : ImplicitAttrNames)
handleAttr(Parent, Callee, AttrName);
}
bool Changed = false;
bool NeedQueuePtr = false;
bool HaveCall = false;
+ bool HasIndirectCall = false;
bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
+ CallingConv::ID CC = F.getCallingConv();
+ bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);
+
+  // If hasAddressTaken() is true for this function, then add all
+  // attributes corresponding to the implicit args.
+ if (CallingConvSupportsAllImplicits &&
+ F.hasAddressTaken(nullptr, true, true, true)) {
+ for (StringRef AttrName : ImplicitAttrNames) {
+ F.addFnAttr(AttrName);
+ }
+ Changed = true;
+ }
for (BasicBlock &BB : F) {
for (Instruction &I : BB) {
const Function *Callee =
dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());
- // TODO: Do something with indirect calls.
+        // Note the occurrence of an indirect call.
if (!Callee) {
- if (!CB->isInlineAsm())
+ if (!CB->isInlineAsm()) {
+ HasIndirectCall = true;
HaveCall = true;
+ }
continue;
}
Changed = true;
}
+    // This pass cannot copy attributes from callees to callers
+    // if there is an indirect call. Thus, in such cases,
+    // hasAddressTaken() would be false for kernels and functions
+    // making an indirect call (if they are themselves not indirectly called).
+    // We must tag all such kernels/functions with all implicit attributes
+    // for correctness.
+ // e.g.
+ // 1. Kernel K1 makes an indirect call to function F1.
+ // Without detecting an indirect call in K1, this pass will not
+ // add all implicit args to K1 (which is incorrect).
+ // 2. Kernel K1 makes direct call to F1 which makes indirect call to function
+ // F2.
+ // Without detecting an indirect call in F1 (whose hasAddressTaken() is
+ // false), the pass will not add all implicit args to F1 (which is
+ // essential for correctness).
+ if (CallingConvSupportsAllImplicits && HasIndirectCall) {
+ for (StringRef AttrName : ImplicitAttrNames) {
+ F.addFnAttr(AttrName);
+ }
+ Changed = true;
+ }
+
return Changed;
}
else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
}
- // Set -fixed-function-abi to true if not provided..
- if (TT.getOS() == Triple::AMDHSA &&
- EnableAMDGPUFixedFunctionABIOpt.getNumOccurrences() == 0)
- EnableFixedFunctionABI = true;
}
bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
- ; CHECK: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %8, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %9, !0
- ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8
- ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %9
+ ; CHECK: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %1, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %2, !0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1
+ ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %2
; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY1]], [[COPY2]]
; CHECK: $vgpr0 = COPY [[ADD]](s32)
; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
; CHECK: bb.1.entry:
; CHECK: liveins: $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
- ; CHECK: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8
- ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8
+ ; CHECK: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1
; CHECK: $vgpr0 = COPY [[COPY1]](s32)
; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0
; CHECK: bb.1.entry:
; CHECK: liveins: $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
- ; CHECK: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %8
- ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8
+ ; CHECK: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %1
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1
; CHECK: $vgpr0 = COPY [[COPY1]](s32)
; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
- ; CHECK: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8, 1835018 /* regdef:VGPR_32 */, def %9
- ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8
- ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %9
+ ; CHECK: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1, 1835018 /* regdef:VGPR_32 */, def %2
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1
+ ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %2
; CHECK: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY1]], [[COPY2]]
; CHECK: $vgpr0 = COPY [[FADD]](s32)
; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
- ; CHECK: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8, 2883594 /* regdef:VReg_64 */, def %9
- ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8
- ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY %9
+ ; CHECK: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1, 2883594 /* regdef:VReg_64 */, def %2
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1
+ ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY %2
; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](s64)
; CHECK: $vgpr0 = COPY [[UV]](s32)
; CHECK: $vgpr1 = COPY [[UV1]](s32)
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32)
- ; CHECK: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %9, 1835017 /* reguse:VGPR_32 */, [[COPY2]]
- ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY %9
+ ; CHECK: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %2, 1835017 /* reguse:VGPR_32 */, [[COPY2]]
+ ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY %2
; CHECK: $vgpr0 = COPY [[COPY3]](s32)
; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
; CHECK: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
- ; CHECK: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 1835018 /* regdef:VGPR_32 */, def %9, 196622 /* mem:m */, [[COPY]](p3)
- ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %9
+ ; CHECK: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 1835018 /* regdef:VGPR_32 */, def %2, 196622 /* mem:m */, [[COPY]](p3)
+ ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %2
; CHECK: $vgpr0 = COPY [[COPY2]](s32)
; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
; CHECK: S_SETPC_B64_return [[COPY3]], implicit $vgpr0
; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[AND]](s32)
- ; CHECK: INLINEASM &";", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %11, 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3)
- ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY %11
+ ; CHECK: INLINEASM &";", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %4, 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3)
+ ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY %4
; CHECK: $vgpr0 = COPY [[COPY3]](s32)
; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
; CHECK: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
; CHECK: bb.1.entry:
; CHECK: liveins: $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
- ; CHECK: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %8
- ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8
- ; CHECK: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %10
- ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %10
+ ; CHECK: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %1
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1
+ ; CHECK: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %3
+ ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %3
; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY1]](s32)
; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY2]](s32)
- ; CHECK: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %12, 1966089 /* reguse:SReg_32 */, [[COPY3]], 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3)
- ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY %12
+ ; CHECK: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %5, 1966089 /* reguse:SReg_32 */, [[COPY3]], 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3)
+ ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY %5
; CHECK: $vgpr0 = COPY [[COPY5]](s32)
; CHECK: [[COPY6:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
; CHECK: S_SETPC_B64_return [[COPY6]], implicit $vgpr0
; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]](s32)
; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32)
; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY1]](s32)
- ; CHECK: INLINEASM &"; ", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %11, 1835018 /* regdef:VGPR_32 */, def %12, 1835018 /* regdef:VGPR_32 */, def %13, 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY5]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY6]](tied-def 5)
- ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY %11
- ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY %12
- ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY %13
+ ; CHECK: INLINEASM &"; ", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %4, 1835018 /* regdef:VGPR_32 */, def %5, 1835018 /* regdef:VGPR_32 */, def %6, 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY5]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY6]](tied-def 5)
+ ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY %4
+ ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY %5
+ ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY %6
; CHECK: G_STORE [[COPY7]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; CHECK: G_STORE [[COPY8]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; CHECK: G_STORE [[COPY9]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; CHECK: bb.1.entry:
; CHECK: liveins: $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
- ; CHECK: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %8
- ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8
+ ; CHECK: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %1
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1
; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]](s32)
- ; CHECK: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %10, 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3)
- ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY %10
+ ; CHECK: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %3, 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3)
+ ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY %3
; CHECK: $vgpr0 = COPY [[COPY3]](s32)
; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
; CHECK: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_mov_b64 s[0:1], s[6:7]
+; GFX8-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX8-NEXT: s_trap 2
; GFX8-NEXT: ds_write_b32 v0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-LABEL: func_use_lds_global_constexpr_cast:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_mov_b64 s[0:1], s[6:7]
+; GFX8-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX8-NEXT: s_trap 2
-; GFX8-NEXT: flat_store_dword v[0:1], v0
-; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: flat_store_dword v[0:1], v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: func_use_lds_global_constexpr_cast:
; ALL-LABEL: {{^}}test_workitem_id_x_func:
; ALL: s_waitcnt
-; HSA-NEXT: v_and_b32_e32 v2, 0x3ff, v31
+; HSA-NEXT: v_and_b32_e32 v2, 0x3ff, v2
; MESA-NEXT: v_and_b32_e32 v2, 0x3ff, v2
define void @test_workitem_id_x_func(i32 addrspace(1)* %out) #1 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
}
; ALL-LABEL: {{^}}test_workitem_id_y_func:
-; HSA: v_lshrrev_b32_e32 v2, 10, v31
+; HSA: v_lshrrev_b32_e32 v2, 10, v2
; MESA: v_lshrrev_b32_e32 v2, 10, v2
define void @test_workitem_id_y_func(i32 addrspace(1)* %out) #1 {
%id = call i32 @llvm.amdgcn.workitem.id.y()
}
; ALL-LABEL: {{^}}test_workitem_id_z_func:
-; HSA: v_lshrrev_b32_e32 v2, 20, v31
+; HSA: v_lshrrev_b32_e32 v2, 20, v2
; MESA: v_lshrrev_b32_e32 v2, 20, v2
define void @test_workitem_id_z_func(i32 addrspace(1)* %out) #1 {
%id = call i32 @llvm.amdgcn.workitem.id.z()
; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v4
; GCN-NEXT: v_add_u32_e32 v2, s6, v2
; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
-; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v31
+; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v5
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_u32_e32 v2, v2, v3
; GCN-NEXT: global_store_dword v[0:1], v2, off
; GCN-NEXT: s_add_u32 s6, s32, 0x1000
; GCN-NEXT: s_and_b32 s6, s6, 0xfffff000
; GCN-NEXT: v_mov_b32_e32 v2, 0
-; GCN-NEXT: v_mov_b32_e32 v4, s6
-; GCN-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GCN-NEXT: v_mov_b32_e32 v5, s6
+; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
; GCN-NEXT: v_mov_b32_e32 v2, 1
-; GCN-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
+; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen offset:4
; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v3
; GCN-NEXT: v_add_u32_e32 v2, s6, v2
; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
-; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v31
+; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v4
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_u32_e32 v2, v2, v3
; GCN-NEXT: global_store_dword v[0:1], v2, off
; Test handling inside a non-kernel
; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast_func:
-; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x10{{$}}
+; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
; CI-DAG: v_cmp_ne_u32_e32 vcc, -1, v0
; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; GCN-LABEL: {{^}}kernel_call_func_32_agprs:
; GFX908: .amdhsa_next_free_vgpr 32
-; GFX90A: .amdhsa_accum_offset 32
-; GCN: NumVgprs: 32
+; GFX90A: .amdhsa_accum_offset 12
+; GCN: NumVgprs: 9
; GCN: NumAgprs: 32
; GFX908: TotalNumVgprs: 32
-; GFX90A: TotalNumVgprs: 64
+; GFX90A: TotalNumVgprs: 44
; GFX908: VGPRBlocks: 7
-; GFX90A: VGPRBlocks: 7
+; GFX90A: VGPRBlocks: 5
; GFX908: NumVGPRsForWavesPerEU: 32
-; GFX90A: NumVGPRsForWavesPerEU: 64
-; GFX90A: AccumOffset: 32
+; GFX90A: NumVGPRsForWavesPerEU: 44
+; GFX90A: AccumOffset: 12
; GCN: Occupancy: 8
-; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 7
+; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 2
define amdgpu_kernel void @kernel_call_func_32_agprs() #0 {
bb:
call void @func_32_agprs() #0
}
; GCN-LABEL: {{^}}func_call_func_32_agprs:
-; GCN: NumVgprs: 32
+; GCN: NumVgprs: 9
; GCN: NumAgprs: 32
; GFX908: TotalNumVgprs: 32
-; GFX90A: TotalNumVgprs: 64
+; GFX90A: TotalNumVgprs: 44
define void @func_call_func_32_agprs() #0 {
bb:
call void @func_32_agprs() #0
declare void @undef_func()
; GCN-LABEL: {{^}}kernel_call_undef_func:
-; GFX908: .amdhsa_next_free_vgpr 32
-; GFX90A: .amdhsa_next_free_vgpr 56
-; GFX90A: .amdhsa_accum_offset 32
-; GCN: NumVgprs: 32
+; GFX908: .amdhsa_next_free_vgpr 24
+; GFX90A: .amdhsa_next_free_vgpr 48
+; GFX90A: .amdhsa_accum_offset 24
+; GCN: NumVgprs: 24
; GCN: NumAgprs: 24
-; GFX908: TotalNumVgprs: 32
-; GFX90A: TotalNumVgprs: 56
-; GFX908: VGPRBlocks: 7
-; GFX90A: VGPRBlocks: 6
-; GFX908: NumVGPRsForWavesPerEU: 32
-; GFX90A: NumVGPRsForWavesPerEU: 56
-; GFX90A: AccumOffset: 32
-; GFX908: Occupancy: 8
+; GFX908: TotalNumVgprs: 24
+; GFX90A: TotalNumVgprs: 48
+; GFX908: VGPRBlocks: 5
+; GFX90A: VGPRBlocks: 5
+; GFX908: NumVGPRsForWavesPerEU: 24
+; GFX90A: NumVGPRsForWavesPerEU: 48
+; GFX90A: AccumOffset: 24
+; GFX908: Occupancy: 10
; GFX90A: Occupancy: 8
-; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 7
+; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 5
define amdgpu_kernel void @kernel_call_undef_func() #0 {
bb:
call void @undef_func()
; HSA: attributes #17 = { nounwind "uniform-work-group-size"="false" }
; HSA: attributes #18 = { nounwind }
; HSA: attributes #19 = { nounwind "amdgpu-calls" "uniform-work-group-size"="false" }
-; HSA: attributes #20 = { nounwind "amdgpu-dispatch-ptr" "target-cpu"="fiji" }
+; HSA: attributes #20 = { nounwind "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" "target-cpu"="fiji" }
; GCN-LABEL: {{^}}test_call_external_void_func_i1_signext:
; HSA: buffer_load_ubyte [[VAR:v[0-9]+]]
-; HSA-DAG: s_mov_b32 s32, 0
+; HSA: s_mov_b32 s32, 0
; MESA-DAG: buffer_load_ubyte [[VAR:v[0-9]+]]
; MESA-DAG: s_mov_b32 s32, 0{{$}}
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_signext@rel32@lo+4
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_signext@rel32@hi+12
-; MESA-DAG: v_bfe_i32 v0, v0, 0, 1
-; HSA: v_bfe_i32 v0, v3, 0, 1
+; GCN-NEXT: v_bfe_i32 v0, v0, 0, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
; FIXME: load should be scheduled before getpc
; GCN-LABEL: {{^}}test_call_external_void_func_i1_zeroext:
-; HSA: buffer_load_ubyte v3
+; HSA: buffer_load_ubyte v0
; HSA-DAG: s_mov_b32 s32, 0{{$}}
; MESA: buffer_load_ubyte v0
; MESA-DAG: s_mov_b32 s32, 0{{$}}
-; MESA: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
-; MESA-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4
-; MESA-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_zeroext@rel32@hi+12
-; MESA-NEXT: v_and_b32_e32 v0, 1, v0
-; MESA-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
-; MESA-NEXT: s_endpgm
-; HSA: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
-; HSA-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4
-; HSA-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_zeroext@rel32@hi+12
-; HSA-NEXT: v_and_b32_e32 v0, 1, v3
-; HSA-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
-; HSA-NEXT: s_endpgm
+; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_zeroext@rel32@hi+12
+; GCN-NEXT: v_and_b32_e32 v0, 1, v0
+; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
+; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
%var = load volatile i1, i1 addrspace(1)* undef
call void @external_void_func_i1_zeroext(i1 %var)
; FIXME: don't wait before call
; GCN-LABEL: {{^}}test_call_external_void_func_i8_signext:
-; MESA-DAG: buffer_load_sbyte v0
-; HSA-DAG: buffer_load_sbyte v3
+; GCN-DAG: buffer_load_sbyte v0
; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_signext@rel32@lo+4
; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_signext@rel32@hi+12
; GCN-DAG: s_mov_b32 s32, 0
; GCN-NOT: s_waitcnt
-; GCN-DAG: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
+; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
%var = load volatile i8, i8 addrspace(1)* undef
; GCN-LABEL: {{^}}test_call_external_void_func_i8_zeroext:
-; MESA-DAG: buffer_load_ubyte v0
-; HSA-DAG: buffer_load_ubyte v3
+; GCN-DAG: buffer_load_ubyte v0
; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_zeroext@rel32@lo+4
; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_zeroext@rel32@hi+12
; GCN-DAG: s_mov_b32 s32, 0
; GCN-NOT: s_waitcnt
-; GCN-DAG: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
+; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
%var = load volatile i8, i8 addrspace(1)* undef
; GCN-LABEL: {{^}}test_call_external_void_func_i16_signext:
-; MESA-DAG: buffer_load_sshort v0
-; HSA-DAG: buffer_load_sshort v3
+; GCN-DAG: buffer_load_sshort v0
; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_signext@rel32@lo+4
; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_signext@rel32@hi+12
; GCN-DAG: s_mov_b32 s32, 0
; GCN-NOT: s_waitcnt
-; GCN-DAG: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
+; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
%var = load volatile i16, i16 addrspace(1)* undef
; GCN-DAG: s_mov_b32 s32, 0
; GCN-NOT: s_waitcnt
-; GCN-DAG: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
+; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
%var = load volatile i16, i16 addrspace(1)* undef
; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_imm: {{.*}}
-; GCN-NOT: v3,
+; GCN-NOT: v3
; GCN-DAG: v_mov_b32_e32 v0, 3
; GCN-DAG: v_mov_b32_e32 v1, 4
; GCN-DAG: v_mov_b32_e32 v2, 5
; GCN-DAG: buffer_load_dwordx4 v[20:23], off
; GCN-DAG: buffer_load_dwordx4 v[24:27], off
; GCN-DAG: buffer_load_dwordx4 v[28:31], off
-; MESA-NOT: s_waitcnt
+; GCN-NOT: s_waitcnt
; GCN: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
%ptr = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef
; GCN-DAG: buffer_load_dwordx4 v[28:31], off
; GCN: s_waitcnt
-; MESA: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32{{$}}
-; HSA: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32 offset:4
+; GCN: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32{{$}}
; GCN: s_swappc_b64
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
}
; GCN-LABEL: {{^}}test_call_external_void_func_struct_i8_i32:
-; MESA: buffer_load_ubyte v0, off
-; MESA-DAG: buffer_load_dword v1, off
-; HSA: buffer_load_ubyte v3, off
-; HSA-DAG: buffer_load_dword v4, off
-; MESA-NOT: s_waitcnt
+; GCN: buffer_load_ubyte v0, off
+; GCN: buffer_load_dword v1, off
+; GCN-NOT: s_waitcnt
; GCN: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
%ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
}
; GCN-LABEL: {{^}}tail_call_byval_align16:
-; GCN-NOT: s32,
-; MESA: buffer_load_dword [[VREG1:v[0-9]+]], off, s[0:3], s32 offset:8
-; MESA: buffer_load_dword [[VREG2:v[0-9]+]], off, s[0:3], s32 offset:12
-; HSA: buffer_load_dword [[VREG1:v[0-9]+]], off, s[0:3], s32
-; HSA: buffer_load_dword [[VREG2:v[0-9]+]], off, s[0:3], s32 offset:24
+; GCN-NOT: s32
+; GCN: buffer_load_dword [[VREG1:v[0-9]+]], off, s[0:3], s32 offset:8
+; GCN: buffer_load_dword [[VREG2:v[0-9]+]], off, s[0:3], s32 offset:12
; GCN: s_getpc_b64
-; MESA: buffer_store_dword [[VREG2]], off, s[0:3], s32 offset:4
-; MESA: buffer_store_dword [[VREG1]], off, s[0:3], s32{{$}}
-; HSA: buffer_store_dword [[VREG2]], off, s[0:3], s32 offset:16
-; HSA: buffer_store_dword [[VREG1]], off, s[0:3], s32
-; GCN-NOT: s32,
+; GCN: buffer_store_dword [[VREG2]], off, s[0:3], s32 offset:4
+; GCN: buffer_store_dword [[VREG1]], off, s[0:3], s32{{$}}
+; GCN-NOT: s32
; GCN: s_setpc_b64
define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
entry:
; GCN-LABEL: {{^}}tail_call_stack_passed_arg_alignment_v32i32_f64:
; GCN-NOT: s32
-; MESA: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; MESA: buffer_load_dword v33, off, s[0:3], s32{{$}}
-; MESA: s_getpc_b64
-; MESA: buffer_store_dword v33, off, s[0:3], s32{{$}}
-; MESA: buffer_store_dword v32, off, s[0:3], s32 offset:4
-; HSA: buffer_load_dword v32, off, s[0:3], s32 offset:8
-; HSA: buffer_load_dword v33, off, s[0:3], s32 offset:4
-; HSA: s_getpc_b64
-; HSA: buffer_store_dword v33, off, s[0:3], s32 offset:4
-; HSA: buffer_store_dword v32, off, s[0:3], s32 offset:8
+; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GCN: buffer_load_dword v33, off, s[0:3], s32{{$}}
+; GCN: s_getpc_b64
+; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}}
+; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4
; GCN-NOT: s32
; GCN: s_setpc_b64
define void @tail_call_stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 {
}
; GCN-LABEL: {{^}}stack_12xv3i32:
-; MESA: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
-; MESA: buffer_store_dword [[REG12]], {{.*$}}
-; MESA: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
-; MESA: buffer_store_dword [[REG13]], {{.*}} offset:4
-; MESA: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
-; MESA: buffer_store_dword [[REG14]], {{.*}} offset:8
-; MESA: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
-; MESA: buffer_store_dword [[REG15]], {{.*}} offset:12
-; MESA: v_mov_b32_e32 v31, 11
-; MESA: s_getpc
-; HSA: v_mov_b32_e32 [[REG12:v[0-9]+]], 11
-; HSA: buffer_store_dword [[REG12]], {{.*$}}
-; HSA: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
-; HSA: buffer_store_dword [[REG12]], {{.*}} offset:4
-; HSA: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
-; HSA: buffer_store_dword [[REG13]], {{.*}} offset:8
-; HSA: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
-; HSA: buffer_store_dword [[REG14]], {{.*}} offset:12
-; HSA: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
-; HSA: buffer_store_dword [[REG15]], {{.*}} offset:16
-; HSA: s_getpc
+; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
+; GCN: buffer_store_dword [[REG12]], {{.*$}}
+; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
+; GCN: buffer_store_dword [[REG13]], {{.*}} offset:4
+; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
+; GCN: buffer_store_dword [[REG14]], {{.*}} offset:8
+; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
+; GCN: buffer_store_dword [[REG15]], {{.*}} offset:12
+; GCN: v_mov_b32_e32 v31, 11
+; GCN: s_getpc
define void @stack_12xv3i32() #0 {
entry:
call void @external_void_func_12xv3i32(
}
; GCN-LABEL: {{^}}stack_12xv3f32:
-; MESA: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
-; MESA: buffer_store_dword [[REG12]], {{.*$}}
-; MESA: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
-; MESA: buffer_store_dword [[REG13]], {{.*}} offset:4
-; MESA: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
-; MESA: buffer_store_dword [[REG14]], {{.*}} offset:8
-; MESA: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
-; MESA: buffer_store_dword [[REG15]], {{.*}} offset:12
-; MESA: v_mov_b32_e32 v31, 0x41300000
-; MESA: s_getpc
-; HSA: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
-; HSA: buffer_store_dword [[REG12]], {{.*}} offset:4
-; HSA: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
-; HSA: buffer_store_dword [[REG13]], {{.*}} offset:8
-; HSA: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
-; HSA: buffer_store_dword [[REG14]], {{.*}} offset:12
-; HSA: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
-; HSA: buffer_store_dword [[REG15]], {{.*}} offset:16
-; HSA: s_getpc
+; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
+; GCN: buffer_store_dword [[REG12]], {{.*$}}
+; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
+; GCN: buffer_store_dword [[REG13]], {{.*}} offset:4
+; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
+; GCN: buffer_store_dword [[REG14]], {{.*}} offset:8
+; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
+; GCN: buffer_store_dword [[REG15]], {{.*}} offset:12
+; GCN: v_mov_b32_e32 v31, 0x41300000
+; GCN: s_getpc
define void @stack_12xv3f32() #0 {
entry:
call void @external_void_func_12xv3f32(
; GCN-LABEL: {{^}}stack_8xv5i32:
-; MESA: v_mov_b32_e32 [[REG8:v[0-9]+]], 8
-; MESA: buffer_store_dword [[REG8]], {{.*$}}
-; MESA: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
-; MESA: buffer_store_dword [[REG9]], {{.*}} offset:4
-; MESA: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
-; MESA: buffer_store_dword [[REG10]], {{.*}} offset:8
-; MESA: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
-; MESA: buffer_store_dword [[REG11]], {{.*}} offset:12
-; MESA: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
-; MESA: buffer_store_dword [[REG12]], {{.*}} offset:16
-; MESA: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
-; MESA: buffer_store_dword [[REG13]], {{.*}} offset:20
-; MESA: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
-; MESA: buffer_store_dword [[REG14]], {{.*}} offset:24
-; MESA: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
-; MESA: buffer_store_dword [[REG15]], {{.*}} offset:28
-; HSA: v_mov_b32_e32 [[REG8:v[0-9]+]], 8
-; HSA: buffer_store_dword [[REG8]], {{.*}} offset:4
-; HSA: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
-; HSA: buffer_store_dword [[REG9]], {{.*}} offset:8
-; HSA: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
-; HSA: buffer_store_dword [[REG10]], {{.*}} offset:12
-; HSA: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
-; HSA: buffer_store_dword [[REG11]], {{.*}} offset:16
-; HSA: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
-; HSA: buffer_store_dword [[REG12]], {{.*}} offset:20
-; HSA: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
-; HSA: buffer_store_dword [[REG13]], {{.*}} offset:24
-; HSA: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
-; HSA: buffer_store_dword [[REG14]], {{.*}} offset:28
-; HSA: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
-; HSA: buffer_store_dword [[REG15]], {{.*}} offset:32
-
-
-; MESA: v_mov_b32_e32 v31, 7
+; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 8
+; GCN: buffer_store_dword [[REG8]], {{.*$}}
+; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
+; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4
+; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
+; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8
+; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
+; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12
+; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
+; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16
+; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
+; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20
+; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
+; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24
+; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
+; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28
+
+; GCN: v_mov_b32_e32 v31, 7
; GCN: s_getpc
define void @stack_8xv5i32() #0 {
entry:
}
; GCN-LABEL: {{^}}stack_8xv5f32:
-; MESA: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000
-; MESA: buffer_store_dword [[REG8]], {{.*$}}
-; MESA: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000
-; MESA: buffer_store_dword [[REG9]], {{.*}} offset:4
-; MESA: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000
-; MESA: buffer_store_dword [[REG10]], {{.*}} offset:8
-; MESA: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000
-; MESA: buffer_store_dword [[REG11]], {{.*}} offset:12
-; MESA: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
-; MESA: buffer_store_dword [[REG12]], {{.*}} offset:16
-; MESA: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
-; MESA: buffer_store_dword [[REG13]], {{.*}} offset:20
-; MESA: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
-; MESA: buffer_store_dword [[REG14]], {{.*}} offset:24
-; MESA: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
-; MESA: buffer_store_dword [[REG15]], {{.*}} offset:28
-; MESA: v_mov_b32_e32 v31, 0x40e00000
-
-; HSA: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x40e00000
-; HSA: buffer_store_dword [[REG8]], {{.*$}}
-; HSA: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000
-; HSA: buffer_store_dword [[REG8]], {{.*}} offset:4
-; HSA: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000
-; HSA: buffer_store_dword [[REG9]], {{.*}} offset:8
-; HSA: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000
-; HSA: buffer_store_dword [[REG10]], {{.*}} offset:12
-; HSA: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000
-; HSA: buffer_store_dword [[REG11]], {{.*}} offset:16
-; HSA: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
-; HSA: buffer_store_dword [[REG12]], {{.*}} offset:20
-; HSA: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
-; HSA: buffer_store_dword [[REG13]], {{.*}} offset:24
-; HSA: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
-; HSA: buffer_store_dword [[REG14]], {{.*}} offset:28
-; HSA: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
-; HSA: buffer_store_dword [[REG15]], {{.*}} offset:32
+; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000
+; GCN: buffer_store_dword [[REG8]], {{.*$}}
+; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000
+; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4
+; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000
+; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8
+; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000
+; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12
+; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
+; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16
+; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
+; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20
+; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
+; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24
+; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
+; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28
+
+; GCN: v_mov_b32_e32 v31, 0x40e00000
; GCN: s_getpc
define void @stack_8xv5f32() #0 {
entry:
; FIXME: Emitting unnecessary flat_scratch setup
; GCN-LABEL: {{^}}test_call_undef:
-; SDAG: s_mov_b32 flat_scratch_lo, s5
-; SDAG: s_add_u32 s4, s4, s7
+; SDAG: s_mov_b32 flat_scratch_lo, s11
+; SDAG: s_add_u32 s10, s10, s15
; SDAG: s_lshr_b32
; GCN: s_endpgm
define amdgpu_kernel void @test_call_undef() #0 {
}
; GCN-LABEL: {{^}}test_call_null:
-; SDAG: s_mov_b32 flat_scratch_lo, s5
-; SDAG: s_add_u32 s4, s4, s7
+; SDAG: s_mov_b32 flat_scratch_lo, s11
+; SDAG: s_add_u32 s10, s10, s15
; SDAG: s_lshr_b32
; GISEL: s_swappc_b64 s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-LABEL: {{^}}use_workitem_id_x:
; GCN: s_waitcnt
-; GCN-NEXT: v_and_b32_e32 v1, 0x3ff, v31
+; GCN-NEXT: v_and_b32_e32 v1, 0x3ff, v1
; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GCN-NEXT: s_setpc_b64
define hidden i32 @use_workitem_id_x(i32 %arg0) #0 {
; GCN: s_getpc_b64
; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, use_workitem_id_x@rel32@lo+4
; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, use_workitem_id_x@rel32@hi+12
+; GCN: v_or_b32_e32 v1, v0
; GCN: v_mov_b32_e32 v0, 9
; GCN: s_swappc_b64
; GCN: v_add_f32_e32
; CI: NumSgprs: 48
; VI-NOBUG: NumSgprs: 48
; VI-BUG: NumSgprs: 96
-; GCN: NumVgprs: 32
+; GCN: NumVgprs: 24
define amdgpu_kernel void @count_use_sgpr96_external_call() {
entry:
tail call void asm sideeffect "; sgpr96 $0", "s"(<3 x i32> <i32 10, i32 11, i32 12>) #1
; CI: NumSgprs: 48
; VI-NOBUG: NumSgprs: 48
; VI-BUG: NumSgprs: 96
-; GCN: NumVgprs: 32
+; GCN: NumVgprs: 24
define amdgpu_kernel void @count_use_sgpr160_external_call() {
entry:
tail call void asm sideeffect "; sgpr160 $0", "s"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1
; CI: NumSgprs: 48
; VI-NOBUG: NumSgprs: 48
; VI-BUG: NumSgprs: 96
-; GCN: NumVgprs: 32
+; GCN: NumVgprs: 24
define amdgpu_kernel void @count_use_vgpr160_external_call() {
entry:
tail call void asm sideeffect "; vgpr160 $0", "v"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1
declare hidden void @external_void_func_void() #0
; GCN-LABEL: {{^}}test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
-; GCN: s_getpc_b64 s[44:45]
-; GCN-NEXT: s_add_u32 s44, s44,
-; GCN-NEXT: s_addc_u32 s45, s45,
+; GCN: s_getpc_b64 s[34:35]
+; GCN-NEXT: s_add_u32 s34, s34,
+; GCN-NEXT: s_addc_u32 s35, s35,
; GCN-NEXT: s_mov_b32 s32, 0
-; GCN: s_swappc_b64 s[30:31], s[44:45]
+; GCN: s_swappc_b64 s[30:31], s[34:35]
-; GCN-DAG: #ASMSTART
-; GCN-DAG: #ASMEND
-; GCN-DAG: s_swappc_b64 s[30:31], s[44:45]
+; GCN-NEXT: #ASMSTART
+; GCN-NEXT: #ASMEND
+; GCN-NEXT: s_swappc_b64 s[30:31], s[34:35]
define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 {
call void @external_void_func_void()
call void asm sideeffect "", ""() #0
; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
; MUBUF: buffer_store_dword
; FLATSCR: scratch_store_dword
-; GCN: v_writelane_b32 v41, s33, 15
-; GCN-NEXT: v_writelane_b32 v41, s34, 0
-; GCN-NEXT: v_writelane_b32 v41, s35, 1
-; GCN-NEXT: v_writelane_b32 v41, s36, 2
-; GCN-NEXT: v_writelane_b32 v41, s37, 3
-; GCN-NEXT: v_writelane_b32 v41, s38, 4
-; GCN-NEXT: v_writelane_b32 v41, s39, 5
-; GCN-NEXT: v_writelane_b32 v41, s40, 6
-; GCN-NEXT: v_writelane_b32 v41, s41, 7
-; GCN-NEXT: v_writelane_b32 v41, s42, 8
-; GCN-NEXT: v_writelane_b32 v41, s43, 9
-; GCN-NEXT: v_writelane_b32 v41, s44, 10
-; GCN-NEXT: v_writelane_b32 v41, s46, 11
-; GCN-NEXT: v_writelane_b32 v41, s47, 12
-; GCN-NEXT: v_writelane_b32 v41, s30, 13
+; GCN: v_writelane_b32 v40, s33, 4
+; GCN: v_writelane_b32 v40, s34, 0
+; GCN: v_writelane_b32 v40, s35, 1
+; GCN: v_writelane_b32 v40, s30, 2
+; GCN: v_writelane_b32 v40, s31, 3
; GCN: s_swappc_b64
-; GCN-DAG: ;;#ASMSTART
+; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_swappc_b64
-
-; MUBUF-DAG: v_readlane_b32 s4, v41, 13
-; MUBUF-DAG: v_readlane_b32 s5, v41, 14
-; MUBUF-DAG: v_readlane_b32 s47, v41, 12
-; MUBUF-DAG: v_readlane_b32 s46, v41, 11
-; MUBUF-DAG: v_readlane_b32 s44, v41, 10
-; MUBUF-DAG: v_readlane_b32 s43, v41, 9
-; MUBUF-DAG: v_readlane_b32 s42, v41, 8
-; MUBUF-DAG: v_readlane_b32 s41, v41, 7
-; MUBUF-DAG: v_readlane_b32 s40, v41, 6
-; MUBUF-DAG: v_readlane_b32 s39, v41, 5
-; MUBUF-DAG: v_readlane_b32 s38, v41, 4
-; MUBUF-DAG: v_readlane_b32 s37, v41, 3
-; MUBUF-DAG: v_readlane_b32 s36, v41, 2
-; MUBUF-DAG: v_readlane_b32 s35, v41, 1
-; MUBUF-DAG: v_readlane_b32 s34, v41, 0
-
-; FLATSCR: v_readlane_b32 s0, v41, 13
-; FLATSCR-DAG: v_readlane_b32 s1, v41, 14
-; FLATSCR-DAG: v_readlane_b32 s47, v41, 12
-; FLATSCR-DAG: v_readlane_b32 s46, v41, 11
-; FLATSCR-DAG: v_readlane_b32 s44, v41, 10
-; FLATSCR-DAG: v_readlane_b32 s43, v41, 9
-; FLATSCR-DAG: v_readlane_b32 s42, v41, 8
-; FLATSCR-DAG: v_readlane_b32 s41, v41, 7
-; FLATSCR-DAG: v_readlane_b32 s40, v41, 6
-; FLATSCR-DAG: v_readlane_b32 s39, v41, 5
-; FLATSCR-DAG: v_readlane_b32 s38, v41, 4
-; FLATSCR-DAG: v_readlane_b32 s37, v41, 3
-; FLATSCR-DAG: v_readlane_b32 s36, v41, 2
-; FLATSCR-DAG: v_readlane_b32 s35, v41, 1
-; FLATSCR-DAG: v_readlane_b32 s34, v41, 0
-; FLATSCR-DAG: v_readlane_b32 s33, v41, 15
-
+; MUBUF-DAG: v_readlane_b32 s4, v40, 2
+; MUBUF-DAG: v_readlane_b32 s5, v40, 3
+; FLATSCR-DAG: v_readlane_b32 s0, v40, 2
+; FLATSCR-DAG: v_readlane_b32 s1, v40, 3
+; GCN: v_readlane_b32 s35, v40, 1
+; GCN: v_readlane_b32 s34, v40, 0
+
+; GCN: v_readlane_b32 s33, v40, 4
; MUBUF: buffer_load_dword
; FLATSCR: scratch_load_dword
; GCN: s_setpc_b64
}
; GCN-LABEL: {{^}}test_func_call_external_void_funcx2:
-; MUBUF: buffer_store_dword v41
-; GCN: v_writelane_b32 v41, s33, 15
+; MUBUF: buffer_store_dword v40
+; FLATSCR: scratch_store_dword off, v40
+; GCN: v_writelane_b32 v40, s33, 4
; GCN: s_mov_b32 s33, s32
-; FLATSCR: s_add_u32 s32, s32, 16
-; FLATSCR: scratch_store_dword off, v40
; MUBUF: s_add_u32 s32, s32, 0x400
+; FLATSCR: s_add_u32 s32, s32, 16
; GCN: s_swappc_b64
-; GCN-DAG: s_swappc_b64
+; GCN-NEXT: s_swappc_b64
-; GCN: v_readlane_b32 s33, v41, 15
-; MUBUF: buffer_load_dword v41
-; FLATSCR: scratch_load_dword v41
+; GCN: v_readlane_b32 s33, v40, 4
+; MUBUF: buffer_load_dword v40
+; FLATSCR: scratch_load_dword v40
define void @test_func_call_external_void_funcx2() #0 {
call void @external_void_func_void()
call void @external_void_func_void()
; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_v31:
; GCN: v_mov_b32_e32 v40, v31
-; GCN-DAG: s_swappc_b64
+; GCN-NEXT: s_swappc_b64
; GCN-NEXT: v_mov_b32_e32 v31, v40
define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)* %out) #0 {
%v31 = call i32 asm sideeffect "; def $0", "={v31}"()
; FIXME: What is the expected behavior for reserved registers here?
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33:
-; MUBUF: s_getpc_b64 s[18:19]
-; MUBUF-NEXT: s_add_u32 s18, s18, external_void_func_void@rel32@lo+4
-; MUBUF-NEXT: s_addc_u32 s19, s19, external_void_func_void@rel32@hi+12
-; FLATSCR: s_getpc_b64 s[16:17]
-; FLATSCR-NEXT: s_add_u32 s16, s16, external_void_func_void@rel32@lo+4
-; FLATSCR-NEXT: s_addc_u32 s17, s17, external_void_func_void@rel32@hi+12
+; MUBUF: s_getpc_b64 s[4:5]
+; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
+; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
+; FLATSCR: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
; GCN: s_mov_b32 s32, 0
; GCN: #ASMSTART
; GCN-NEXT: ; def s33
; GCN-NEXT: #ASMEND
-; MUBUF: s_swappc_b64 s[30:31], s[18:19]
-; FLATSCR: s_swappc_b64 s[30:31], s[16:17]
+; MUBUF: s_swappc_b64 s[30:31], s[4:5]
+; FLATSCR: s_swappc_b64 s[30:31], s[0:1]
; GCN: ;;#ASMSTART
; GCN-NEXT: ; use s33
; GCN-NEXT: ;;#ASMEND
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s34: {{.*}}
; GCN-NOT: s34
-; MUBUF: s_getpc_b64 s[18:19]
-; MUBUF-NEXT: s_add_u32 s18, s18, external_void_func_void@rel32@lo+4
-; MUBUF-NEXT: s_addc_u32 s19, s19, external_void_func_void@rel32@hi+12
-; FLATSCR: s_getpc_b64 s[16:17]
-; FLATSCR-NEXT: s_add_u32 s16, s16, external_void_func_void@rel32@lo+4
-; FLATSCR-NEXT: s_addc_u32 s17, s17, external_void_func_void@rel32@hi+12
+; MUBUF: s_getpc_b64 s[4:5]
+; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
+; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
+; FLATSCR: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
; GCN: s_mov_b32 s32, 0
; GCN-NOT: s34
; GCN-NEXT: ;;#ASMEND
; GCN-NOT: s34
-; MUBUF: s_swappc_b64 s[30:31], s[18:19]
-; FLATSCR: s_swappc_b64 s[30:31], s[16:17]
+; MUBUF: s_swappc_b64 s[30:31], s[4:5]
+; FLATSCR: s_swappc_b64 s[30:31], s[0:1]
; GCN-NOT: s34
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v40: {{.*}}
; GCN-NOT: v32
-; MUBUF: s_getpc_b64 s[18:19]
-; MUBUF-NEXT: s_add_u32 s18, s18, external_void_func_void@rel32@lo+4
-; MUBUF-NEXT: s_addc_u32 s19, s19, external_void_func_void@rel32@hi+12
-; FLATSCR: s_getpc_b64 s[16:17]
-; FLATSCR-NEXT: s_add_u32 s16, s16, external_void_func_void@rel32@lo+4
-; FLATSCR-NEXT: s_addc_u32 s17, s17, external_void_func_void@rel32@hi+12
+; MUBUF: s_getpc_b64 s[4:5]
+; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
+; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
+; FLATSCR: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
; GCN: s_mov_b32 s32, 0
; GCN-NOT: v40
; GCN-NEXT: ; def v40
; GCN-NEXT: ;;#ASMEND
-; MUBUF: s_swappc_b64 s[30:31], s[18:19]
-; FLATSCR: s_swappc_b64 s[30:31], s[16:17]
+; MUBUF: s_swappc_b64 s[30:31], s[4:5]
+; FLATSCR: s_swappc_b64 s[30:31], s[0:1]
; GCN-NOT: v40
define amdgpu_kernel void @call_memory_arg_load(i32 addrspace(3)* %ptr, i32) #0 {
; GCN-LABEL: call_memory_arg_load:
; GCN: ; %bb.0:
-; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GCN-NEXT: s_mov_b32 s12, s14
-; GCN-NEXT: s_load_dword s14, s[8:9], 0x0
-; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GCN-NEXT: s_add_u32 s0, s0, s17
-; GCN-NEXT: s_addc_u32 s1, s1, 0
-; GCN-NEXT: s_add_u32 s8, s8, 8
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v3, s14
-; GCN-NEXT: ds_read_b32 v3, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NEXT: s_addc_u32 s9, s9, 0
-; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
-; GCN-NEXT: s_mov_b32 s13, s15
-; GCN-NEXT: s_mov_b32 s14, s16
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, v3
-; GCN-NEXT: s_getpc_b64 s[18:19]
-; GCN-NEXT: s_add_u32 s18, s18, func@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s19, s19, func@rel32@hi+12
-; GCN-NEXT: s_mov_b32 s32, 0
-; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GCN-NEXT: s_endpgm
+; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
+; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
+; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
+; GCN-NEXT: s_add_u32 s0, s0, s9
+; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: ds_read_b32 v0, v0
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+12
+; GCN-NEXT: s_mov_b32 s32, 0
+; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT: s_endpgm
%vgpr = load volatile i32, i32 addrspace(3)* %ptr
call void @func(i32 %vgpr)
ret void
define amdgpu_kernel void @call_memory_no_dep(i32 addrspace(1)* %ptr, i32) #0 {
; GCN-LABEL: call_memory_no_dep:
; GCN: ; %bb.0:
-; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GCN-NEXT: s_mov_b32 s13, s15
-; GCN-NEXT: s_mov_b32 s12, s14
-; GCN-NEXT: s_load_dwordx2 s[14:15], s[8:9], 0x0
-; GCN-NEXT: s_add_u32 s0, s0, s17
-; GCN-NEXT: s_addc_u32 s1, s1, 0
-; GCN-NEXT: s_add_u32 s8, s8, 16
-; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NEXT: s_addc_u32 s9, s9, 0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_store_dword v3, v3, s[14:15]
-; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
-; GCN-NEXT: s_mov_b32 s14, s16
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_getpc_b64 s[18:19]
-; GCN-NEXT: s_add_u32 s18, s18, func@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s19, s19, func@rel32@hi+12
-; GCN-NEXT: s_mov_b32 s32, 0
-; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GCN-NEXT: s_endpgm
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
+; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
+; GCN-NEXT: s_add_u32 s0, s0, s9
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_store_dword v0, v0, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_getpc_b64 s[6:7]
+; GCN-NEXT: s_add_u32 s6, s6, func@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s7, s7, func@rel32@hi+12
+; GCN-NEXT: s_mov_b32 s32, 0
+; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GCN-NEXT: s_endpgm
store i32 0, i32 addrspace(1)* %ptr
call void @func(i32 0)
ret void
; Should not wait after the call before memory
define amdgpu_kernel void @call_no_wait_after_call(i32 addrspace(1)* %ptr, i32) #0 {
; GCN-LABEL: call_no_wait_after_call:
-; GCN: %bb.0:
-; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GCN-NEXT: s_add_u32 s0, s0, s17
-; GCN-NEXT: s_load_dwordx2 s[34:35], s[8:9], 0x0
-; GCN-NEXT: s_addc_u32 s1, s1, 0
-; GCN-NEXT: s_add_u32 s8, s8, 16
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NEXT: s_addc_u32 s9, s9, 0
-; GCN-NEXT: s_mov_b32 s12, s14
-; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
-; GCN-NEXT: s_mov_b32 s13, s15
-; GCN-NEXT: s_mov_b32 s14, s16
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_getpc_b64 s[18:19]
-; GCN-NEXT: s_add_u32 s18, s18, func@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s19, s19, func@rel32@hi+12
-; GCN-NEXT: s_mov_b32 s32, 0
-; GCN-NEXT: v_mov_b32_e32 v40, 0
-; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GCN-NEXT: global_store_dword v40, v40, s[34:35]
-; GCN-NEXT: s_endpgm
+; GCN: ; %bb.0:
+; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
+; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
+; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
+; GCN-NEXT: s_add_u32 s0, s0, s9
+; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+12
+; GCN-NEXT: s_mov_b32 s32, 0
+; GCN-NEXT: v_mov_b32_e32 v40, 0
+; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT: global_store_dword v40, v40, s[34:35]
+; GCN-NEXT: s_endpgm
call void @func(i32 0)
store i32 0, i32 addrspace(1)* %ptr
ret void
define amdgpu_kernel void @call_no_wait_after_call_return_val(i32 addrspace(1)* %ptr, i32) #0 {
; GCN-LABEL: call_no_wait_after_call_return_val:
; GCN: ; %bb.0:
-; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GCN-NEXT: s_add_u32 s0, s0, s17
-; GCN-NEXT: s_load_dwordx2 s[34:35], s[8:9], 0x0
-; GCN-NEXT: s_addc_u32 s1, s1, 0
-; GCN-NEXT: s_add_u32 s8, s8, 16
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NEXT: s_addc_u32 s9, s9, 0
-; GCN-NEXT: s_mov_b32 s12, s14
-; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
-; GCN-NEXT: s_mov_b32 s13, s15
-; GCN-NEXT: s_mov_b32 s14, s16
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_getpc_b64 s[18:19]
-; GCN-NEXT: s_add_u32 s18, s18, func.return@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s19, s19, func.return@rel32@hi+12
-; GCN-NEXT: s_mov_b32 s32, 0
-; GCN-NEXT: v_mov_b32_e32 v40, 0
-; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GCN-NEXT: global_store_dword v40, v0, s[34:35]
-; GCN-NEXT: s_endpgm
+; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
+; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
+; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
+; GCN-NEXT: s_add_u32 s0, s0, s9
+; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, func.return@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, func.return@rel32@hi+12
+; GCN-NEXT: s_mov_b32 s32, 0
+; GCN-NEXT: v_mov_b32_e32 v40, 0
+; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT: global_store_dword v40, v0, s[34:35]
+; GCN-NEXT: s_endpgm
%rv = call i32 @func.return(i32 0)
store i32 %rv, i32 addrspace(1)* %ptr
ret void
define amdgpu_kernel void @call_got_load(i32 addrspace(1)* %ptr, i32) #0 {
; GCN-LABEL: call_got_load:
; GCN: ; %bb.0:
-; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GCN-NEXT: s_add_u32 s0, s0, s17
-; GCN-NEXT: s_addc_u32 s1, s1, 0
-; GCN-NEXT: s_add_u32 s8, s8, 16
-; GCN-NEXT: s_addc_u32 s9, s9, 0
-; GCN-NEXT: s_mov_b32 s13, s15
-; GCN-NEXT: s_mov_b32 s12, s14
-; GCN-NEXT: s_getpc_b64 s[14:15]
-; GCN-NEXT: s_add_u32 s14, s14, got.func@gotpcrel32@lo+4
-; GCN-NEXT: s_addc_u32 s15, s15, got.func@gotpcrel32@hi+12
-; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
-; GCN-NEXT: s_mov_b32 s14, s16
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_mov_b32 s32, 0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GCN-NEXT: s_endpgm
+; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
+; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
+; GCN-NEXT: s_add_u32 s0, s0, s9
+; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, got.func@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, got.func@gotpcrel32@hi+12
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_mov_b32 s32, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT: s_endpgm
call void @got.func(i32 0)
ret void
}
define void @tailcall_got_load(i32 addrspace(1)* %ptr, i32) #0 {
; GCN-LABEL: tailcall_got_load:
; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_getpc_b64 s[16:17]
-; GCN-NEXT: s_add_u32 s16, s16, got.func@gotpcrel32@lo+4
-; GCN-NEXT: s_addc_u32 s17, s17, got.func@gotpcrel32@hi+12
-; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[16:17]
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, got.func@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, got.func@gotpcrel32@hi+12
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[4:5]
tail call void @got.func(i32 0)
ret void
}
define void @tail_call_memory_arg_load(i32 addrspace(3)* %ptr, i32) #0 {
; GCN-LABEL: tail_call_memory_arg_load:
; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: ds_read_b32 v0, v0
-; GCN-NEXT: s_getpc_b64 s[16:17]
-; GCN-NEXT: s_add_u32 s16, s16, func@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s17, s17, func@rel32@hi+12
-; GCN-NEXT: s_setpc_b64 s[16:17]
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: ds_read_b32 v0, v0
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+12
+; GCN-NEXT: s_setpc_b64 s[4:5]
%vgpr = load volatile i32, i32 addrspace(3)* %ptr
tail call void @func(i32 %vgpr)
ret void
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -amdgpu-fixed-function-abi=0 --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,CIVI %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-fixed-function-abi=0 --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,CIVI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,GFX9 %s
; GCN-LABEL: {{^}}use_dispatch_ptr:
; GCN: s_load_dword s{{[0-9]+}}, s[4:5]
; GCN-LABEL: {{^}}use_workitem_id_x:
; GCN: s_waitcnt
-; GCN: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31
+; GCN: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v0
; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
; GCN-LABEL: {{^}}use_workitem_id_y:
; GCN: s_waitcnt
-; GCN: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10
+; GCN: v_bfe_u32 [[ID:v[0-9]+]], v0, 10, 10
; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
; GCN-LABEL: {{^}}use_workitem_id_z:
; GCN: s_waitcnt
-; GCN: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10
+; GCN: v_bfe_u32 [[ID:v[0-9]+]], v0, 20, 10
; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
; GCN-LABEL: {{^}}use_workitem_id_xy:
; GCN: s_waitcnt
-; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
+; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
+; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
-; GCN-NEXT: s_waitcnt
-; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
; GCN-LABEL: {{^}}use_workitem_id_xyz:
; GCN: s_waitcnt
-; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
+; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
+; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
+; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
-; GCN-NEXT: s_waitcnt
-; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
-; GCN-NEXT: s_waitcnt
-; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
; GCN-LABEL: {{^}}use_workitem_id_xz:
; GCN: s_waitcnt
-; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
+; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
+; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
-; GCN-NEXT: s_waitcnt
-; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
; GCN-LABEL: {{^}}use_workitem_id_yz:
; GCN: s_waitcnt
-; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
+; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
+; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
-; GCN-NEXT: s_waitcnt
-; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_x:
+; GCN-NOT: v0
; GCN: s_swappc_b64
+; GCN-NOT: v0
-; GCN: .amdhsa_system_vgpr_workitem_id 2
+; GCN: .amdhsa_system_vgpr_workitem_id 0
define amdgpu_kernel void @kern_indirect_use_workitem_id_x() #1 {
call void @use_workitem_id_x()
ret void
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_y:
-; UNPACKED-TID: v_lshlrev_b32_e32 v1, 10, v1
+; GCN-NOT: v0
+; GCN-NOT: v1
+; UNPACKED-TID: v_lshlrev_b32_e32 v0, 10, v1
+; UNPACKED-TID-NOT: v0
+; UNPACKED-TID-NOT: v1
; GCN: s_swappc_b64
-; GCN: .amdhsa_system_vgpr_workitem_id 2
+; GCN: .amdhsa_system_vgpr_workitem_id 1
define amdgpu_kernel void @kern_indirect_use_workitem_id_y() #1 {
call void @use_workitem_id_y()
ret void
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_z:
-; UNPACKED-TID: v_lshlrev_b32_e32 v2, 20, v2
+; GCN-NOT: v0
+; GCN-NOT: v2
+; UNPACKED-TID: v_lshlrev_b32_e32 v0, 20, v2
+; UNPACKED-TID-NOT: v0
+; UNPACKED-TID-NOT: v1
; GCN: s_swappc_b64
; GCN: .amdhsa_system_vgpr_workitem_id 2
; UNPACKED-TID-NOT: v1
; UNPACKED-TID: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
; UNPACKED-TID: v_or_b32_e32 v0, v0, [[IDY]]
+; GCN-NOT: v0
+; GCN-NOT: v1
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 {
call void @use_workitem_id_xy()
; UNPACKED-TID-NOT: v0
; UNPACKED-TID-NOT: v2
; UNPACKED-TID: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
-; UNPACKED-TID: v_or_b32_e32 v31, v0, [[IDZ]]
+; UNPACKED-TID: v_or_b32_e32 v0, v0, [[IDZ]]
+; GCN-NOT: v0
+; GCN-NOT: v2
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 {
call void @use_workitem_id_xz()
; UNPACKED-TID-NOT: v2
; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
-; UNPACKED-TID: v_or_b32_e32 v0, v0, [[IDY]]
-; UNPACKED-TID: v_or_b32_e32 v31, v0, [[IDZ]]
+; UNPACKED-TID: v_or_b32_e32 v0, [[IDY]], [[IDZ]]
; GCN-NOT: v1
+; GCN-NOT: v2
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workitem_id_yz() #1 {
call void @use_workitem_id_yz()
; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, [[IDY]]
-; UNPACKED-TID-DAG: v_or_b32_e32 v31, v0, [[IDZ]]
+; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, [[IDZ]]
+; GCN-NOT: v0
; GCN-NOT: v1
; GCN-NOT: v2
; GCN: s_swappc_b64
; GCN-LABEL: {{^}}other_arg_use_workitem_id_x:
; GCN: s_waitcnt
+; GCN-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v1
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
-; GCN-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
define void @other_arg_use_workitem_id_x(i32 %arg0) #1 {
%val = call i32 @llvm.amdgcn.workitem.id.x()
; GCN-LABEL: {{^}}other_arg_use_workitem_id_y:
; GCN: s_waitcnt
+; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 10, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
-; GCN: s_waitcnt
-; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
define void @other_arg_use_workitem_id_y(i32 %arg0) #1 {
%val = call i32 @llvm.amdgcn.workitem.id.y()
; GCN-LABEL: {{^}}other_arg_use_workitem_id_z:
; GCN: s_waitcnt
+; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 20, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
-; GCN: s_waitcnt
-; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
define void @other_arg_use_workitem_id_z(i32 %arg0) #1 {
%val = call i32 @llvm.amdgcn.workitem.id.z()
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_x:
+; GCN: v_mov_b32_e32 v1, v0
; GCN: v_mov_b32_e32 v0, 0x22b
; GCN: s_swappc_b64
-; GCN: .amdhsa_system_vgpr_workitem_id 2
+; GCN: .amdhsa_system_vgpr_workitem_id 0
define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_x() #1 {
call void @other_arg_use_workitem_id_x(i32 555)
ret void
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_y:
; UNPACKED-TID: v_lshlrev_b32_e32 v1, 10, v1
-; PACKED-TID: v_mov_b32_e32 v31, v0
+; PACKED-TID: v_mov_b32_e32 v1, v0
+; GCN-NOT: v1
; GCN: v_mov_b32_e32 v0, 0x22b
; GCN-NOT: v1
; GCN: s_swappc_b64
; GCN-NOT: v0
-; GCN: .amdhsa_system_vgpr_workitem_id 2
+; GCN: .amdhsa_system_vgpr_workitem_id 1
define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_y() #1 {
call void @other_arg_use_workitem_id_y(i32 555)
ret void
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_z:
; GCN-DAG: v_mov_b32_e32 v0, 0x22b
-; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v2, 20, v2
-; PACKED-TID-DAG: v_mov_b32_e32 v31, v0
+; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v1, 20, v2
+; PACKED-TID-DAG: v_mov_b32_e32 v1, v0
; GCN: s_swappc_b64
; GCN-NOT: v0
}
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x:
-; GFX90A: buffer_load_dword v32, off, s[0:3], s32{{$}}
-; GCN: v_and_b32_e32 v31, 0x3ff, v31
-; GFX7: buffer_load_dword v0, off, s[0:3], s32{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v0
+; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}}
+; GCN: v_and_b32_e32 v32, 0x3ff, v32
+; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
; GCN: s_setpc_b64
define void @too_many_args_use_workitem_id_x(
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x:
; GCN: s_mov_b32 s32, 0
-; GFX7: buffer_store_dword v3, off, s[0:3], s32{{$}}
-; GFX90A: buffer_store_dword v1, off, s[0:3], s32{{$}}
+; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}}
; GCN: s_swappc_b64
-; GCN: .amdhsa_system_vgpr_workitem_id 2
+; GCN: .amdhsa_system_vgpr_workitem_id 0
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 {
call void @too_many_args_use_workitem_id_x(
i32 10, i32 20, i32 30, i32 40,
; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x:
; GCN: s_mov_b32 s33, s32
-; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}}
+; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}}
; GCN: s_swappc_b64
define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
store volatile i32 %arg0, i32 addrspace(1)* undef
; frame[2] = VGPR spill slot
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval:
-; GFX7: buffer_load_dword v0, off, s[0:3], s32
-; GFX90A: buffer_load_dword v32, off, s[0:3], s32
+; GFX7: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX90A: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GCN-DAG: s_waitcnt
; GFX7: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX90A: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32,
-; GFX7: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc{{$}}
-; GFX90A: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc{{$}}
+; GFX7: buffer_load_dword v0, off, s[0:3], s32 glc{{$}}
+; GFX90A: buffer_load_dword v0, off, s[0:3], s32 glc{{$}}
; GCN: s_setpc_b64
define void @too_many_args_use_workitem_id_x_byval(
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval:
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
-; GCN-DAG: buffer_store_dword [[K]], off, s[0:3], 0 offset:4
; GCN-DAG: s_movk_i32 s32, 0x400
-; GFX7: buffer_store_dword v3, off, s[0:3], s32
-; GFX90A: buffer_store_dword v0, off, s[0:3], s32
+; GCN: buffer_store_dword [[K]], off, s[0:3], 0 offset:4
+; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4
; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], 0 offset:4
-; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4
+; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}}
; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]],
; GCN: s_swappc_b64
-; GCN: .amdhsa_system_vgpr_workitem_id 2
+; GCN: .amdhsa_system_vgpr_workitem_id 0
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 {
%alloca = alloca i32, align 4, addrspace(5)
store volatile i32 999, i32 addrspace(5)* %alloca
}
; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval:
-; GCN: buffer_store_dword v40, off, s[0:3], s32 offset:4
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
; GFX7: buffer_store_dword [[K]], off, s[0:3], s33{{$}}
; GFX90A: buffer_store_dword [[K]], off, s[0:3], s33{{$}}
; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}}
-; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4
+; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}}
; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]],
; GCN: s_swappc_b64
define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz:
; GFX90A: buffer_load_dword v32, off, s[0:3], s32{{$}}
-; GFX90A: v_and_b32_e32 v33, 0x3ff, v31
-; GFX90A: v_bfe_u32 v33, v31, 10, 10
-; GCN90A: v_bfe_u32 v31, v31, 20, 10
-; GFX7: v_and_b32_e32 v32, 0x3ff, v31
-; GFX7: v_bfe_u32 v32, v31, 10, 10
-; GCN7: v_bfe_u32 v31, v31, 20, 10
-; GFX7: buffer_load_dword v0, off, s[0:3], s32{{$}}
-; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v12
-; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v30{{$}}
-; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v0{{$}}
-; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v29, off{{$}}
-; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v30, off{{$}}
+; GFX90A: v_and_b32_e32 v33, 0x3ff, v32
+; GFX90A: v_bfe_u32 v34, v32, 10, 10
+; GCN90A: v_bfe_u32 v32, v32, 20, 10
+; GFX7: buffer_load_dword v32, off, s[0:3], s32{{$}}
+; GFX7: v_and_b32_e32 v33, 0x3ff, v32
+; GFX7: v_bfe_u32 v33, v32, 10, 10
+; GCN7: v_bfe_u32 v32, v32, 20, 10
+; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v33{{$}}
+; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32{{$}}
+; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v33, off{{$}}
+; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v34, off{{$}}
; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v32, off{{$}}
+; GFX7-COUNT-32: flat_store_dword v{{\[[0-9]+:[0-9]+]}}
+; GFX90A-COUNT-32: global_store_dword v{{\[[0-9]+:[0-9]+]}}
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @too_many_args_use_workitem_id_xyz(
; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v1, 10, v1
; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, v1
; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v2, 20, v2
-; UNPACKED-TID-DAG: v_or_b32_e32 v31, v0, v2
+; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, v2
; PACKED-TID-NOT: v0
+; PACKED-TID-NOT: v1
; PACKED-TID-NOT: v2
-; GFX7: buffer_store_dword v3, off, s[0:3], s32{{$}}
-; GFX90A: buffer_store_dword v1, off, s[0:3], s32{{$}}
+; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}}
; GCN: s_swappc_b64
; GCN: .amdhsa_system_vgpr_workitem_id 2
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VARABI %s
+; RUN: llc -amdgpu-fixed-function-abi -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FIXEDABI %s
; GCN-LABEL: {{^}}use_workitem_id_x:
; GCN: s_waitcnt
-; GCN: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31
+; VARABI: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v0
+; FIXEDABI: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31
; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
; GCN-LABEL: {{^}}use_workitem_id_y:
; GCN: s_waitcnt
-; GCN: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10
+; VARABI: v_bfe_u32 [[ID:v[0-9]+]], v0, 10, 10
+; FIXEDABI: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10
; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
; GCN-LABEL: {{^}}use_workitem_id_z:
; GCN: s_waitcnt
-; GCN: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10
+; VARABI: v_bfe_u32 [[ID:v[0-9]+]], v0, 20, 10
+; FIXEDABI: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10
; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
; GCN-LABEL: {{^}}use_workitem_id_xy:
; GCN: s_waitcnt
+; VARABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
+; VARABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
-; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
-; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
+; FIXEDABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
+; FIXEDABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
; GCN-LABEL: {{^}}use_workitem_id_xyz:
; GCN: s_waitcnt
+; VARABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
+; VARABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
+; VARABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
-; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
-; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
-; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
+; FIXEDABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
+; FIXEDABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
+; FIXEDABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
; GCN-LABEL: {{^}}use_workitem_id_xz:
; GCN: s_waitcnt
+; VARABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
+; VARABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
-; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
-; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
+; FIXEDABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
+; FIXEDABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
; GCN-LABEL: {{^}}use_workitem_id_yz:
; GCN: s_waitcnt
+; VARABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
+; VARABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
-; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
-; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
+; FIXEDABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
+; FIXEDABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
}
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_x:
-; GCN: enable_vgpr_workitem_id = 2
+; VARABI: enable_vgpr_workitem_id = 0
+; FIXEDABI: enable_vgpr_workitem_id = 2
; FIXEDA-NOT: v0
+; VARABI-NOT: v31
; GCN: s_swappc_b64
-; GCN-NOT: v0
+; FIXEDABI-NOT: v0
+; VARABI-NOT: v31
define amdgpu_kernel void @kern_indirect_use_workitem_id_x() #1 {
call void @use_workitem_id_x()
ret void
}
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_y:
-; GCN: enable_vgpr_workitem_id = 2
+; VARABI: enable_vgpr_workitem_id = 1
+; FIXEDABI: enable_vgpr_workitem_id = 2
-; GCN-NOT: v0
-; GCN-NOT: v1
+; FIXEDABI-NOT: v0
+; FIXEDABI-NOT: v1
+; VARABI-NOT: v31
+; VARABI: v_lshlrev_b32_e32 v0, 10, v1
-; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
-; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
-; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
-; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
+; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
+; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
-; GCN-NOT: v0
-; GCN-NOT: v1
+; FIXEDABI-NOT: v0
+; FIXEDABI-NOT: v1
+; VARABI-NOT: v31
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workitem_id_y() #1 {
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_z:
; GCN: enable_vgpr_workitem_id = 2
+; VARABI-NOT: v0
+; VARABI-NOT: v2
+; VARABI: v_lshlrev_b32_e32 v0, 20, v2
+; VARABI-NOT: v0
+; VARABI-NOT: v1
-; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
-; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
-; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
-; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
+; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
+; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 {
}
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xy:
-
-; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
-; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
-; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
-; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
+; VARABI-NOT: v0
+; VARABI-NOT: v1
+; VARABI: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
+; VARABI: v_or_b32_e32 v0, v0, [[IDY]]
+; VARABI-NOT: v0
+; VARABI-NOT: v1
+
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
+; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
+; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 {
}
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xz:
+; VARABI-NOT: v0
+; VARABI-NOT: v2
+; VARABI: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
+; VARABI: v_or_b32_e32 v0, v0, [[IDZ]]
+; VARABI-NOT: v0
+; VARABI-NOT: v2
-; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
-; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
-; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
-; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
+; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
+; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 {
}
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_yz:
+; VARABI-NOT: v1
+; VARABI-NOT: v2
+; VARABI-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
+; VARABI-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
+; VARABI: v_or_b32_e32 v0, [[IDY]], [[IDZ]]
+; VARABI-NOT: v1
+; VARABI-NOT: v2
-; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
-; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
-; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
-; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
+; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
+; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workitem_id_yz() #1 {
}
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xyz:
-
-; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
-; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
-; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
-; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
+; VARABI-NOT: v0
+; VARABI-NOT: v1
+; VARABI-NOT: v2
+; VARABI-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
+; VARABI-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
+; VARABI-DAG: v_or_b32_e32 v0, v0, [[IDY]]
+; VARABI-DAG: v_or_b32_e32 v0, v0, [[IDZ]]
+; VARABI-NOT: v0
+; VARABI-NOT: v1
+; VARABI-NOT: v2
+
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
+; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
+; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workitem_id_xyz() #1 {
; GCN-LABEL: {{^}}other_arg_use_workitem_id_x:
; GCN: s_waitcnt
-; GCN-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31
+; VARABI-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v1
+; FIXEDABI-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
; GCN-LABEL: {{^}}other_arg_use_workitem_id_y:
; GCN: s_waitcnt
-; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10
+; VARABI-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 10, 10
+; FIXEDABI-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
define void @other_arg_use_workitem_id_y(i32 %arg0) #1 {
; GCN-LABEL: {{^}}other_arg_use_workitem_id_z:
; GCN: s_waitcnt
-; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10
+; VARABI-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 20, 10
+; FIXEDABI-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
define void @other_arg_use_workitem_id_z(i32 %arg0) #1 {
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_x:
-; GCN: enable_vgpr_workitem_id = 2
+; VARABI: enable_vgpr_workitem_id = 0
+; FIXEDABI: enable_vgpr_workitem_id = 2
+; VARABI: v_mov_b32_e32 v1, v0
+; VARABI: v_mov_b32_e32 v0, 0x22b
-; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
-; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
-; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
-; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
+; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
+; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_x() #1 {
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_y:
-
-
-; GCN: enable_vgpr_workitem_id = 2
-; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
-; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
-; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
-; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
+; VARABI: enable_vgpr_workitem_id = 1
+
+; VARABI: v_lshlrev_b32_e32 v1, 10, v1
+; VARABI-NOT: v1
+; VARABI: v_mov_b32_e32 v0, 0x22b
+; VARABI-NOT: v1
+; VARABI: s_swappc_b64
+; VARABI-NOT: v0
+
+; FIXEDABI: enable_vgpr_workitem_id = 2
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
+; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
+; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_y() #1 {
call void @other_arg_use_workitem_id_y(i32 555)
ret void
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_z:
; GCN: enable_vgpr_workitem_id = 2
+; VARABI-DAG: v_mov_b32_e32 v0, 0x22b
+; VARABI-DAG: v_lshlrev_b32_e32 v1, 20, v2
+; VARABI: s_swappc_b64
+; VARABI-NOT: v0
-; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
-; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
-; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
-; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
+; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
+; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 {
call void @other_arg_use_workitem_id_z(i32 555)
ret void
}
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x:
+; VARABI: buffer_load_dword v32, off, s[0:3], s32{{$}}
+; VARABI: v_and_b32_e32 v32, 0x3ff, v32
+; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
+; VARABI: s_setpc_b64
-; GCN: v_and_b32_e32 v31, 0x3ff, v31
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
+; FIXEDABI: v_and_b32_e32 v31, 0x3ff, v31
+; FIXEDABI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
define void @too_many_args_use_workitem_id_x(
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
}
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x:
+; VARABI: enable_vgpr_workitem_id = 0
+; VARABI: s_mov_b32 s32, 0
+; VARABI: buffer_store_dword v0, off, s[0:3], s32{{$}}
+; VARABI: s_swappc_b64
-; GCN: enable_vgpr_workitem_id = 2
-; GCN-DAG: s_mov_b32 s32, 0
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140{{$}}
-; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
-; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
-; GCN-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
-; GCN-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
-; GCN: buffer_store_dword [[K]], off, s[0:3], s32{{$}}
+; FIXEDABI: enable_vgpr_workitem_id = 2
+; FIXEDABI-DAG: s_mov_b32 s32, 0
+; FIXEDABI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140{{$}}
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
+; FIXEDABI-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
+; FIXEDABI-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
+; FIXEDABI: buffer_store_dword [[K]], off, s[0:3], s32{{$}}
-; GCN: s_swappc_b64
+; FIXEDABI: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 {
call void @too_many_args_use_workitem_id_x(
i32 10, i32 20, i32 30, i32 40,
}
; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x:
+; VARABI: s_mov_b32 s33, s32
+; VARABI: buffer_store_dword v1, off, s[0:3], s32{{$}}
; Touching the workitem id register is not necessary.
-; GCN-NOT: v31
-; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x140{{$}}
-; GCN-NOT: v31
-; GCN: buffer_store_dword [[K]], off, s[0:3], s32{{$}}
-; GCN-NOT: v31
+; FIXEDABI-NOT: v31
+; FIXEDABI: v_mov_b32_e32 [[K:v[0-9]+]], 0x140{{$}}
+; FIXEDABI-NOT: v31
+; FIXEDABI: buffer_store_dword [[K]], off, s[0:3], s32{{$}}
+; FIXEDABI-NOT: v31
; GCN: s_swappc_b64
define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
; frame[2] = VGPR spill slot
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval:
+; VARABI: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VARABI-NEXT: s_waitcnt
+; VARABI-NEXT: v_and_b32_e32 v32, 0x3ff, v32
+; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32
+; VARABI: buffer_load_dword v0, off, s[0:3], s32 glc{{$}}
+; VARABI: s_setpc_b64
-; GCN: v_and_b32_e32 v31, 0x3ff, v31
-; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v31
+; FIXEDABI: v_and_b32_e32 v31, 0x3ff, v31
+; FIXEDABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v31
-; GCN: buffer_load_dword v0, off, s[0:3], s32{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
-; GCN: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc{{$}}
-; GCN: s_setpc_b64
+; FIXEDABI: buffer_load_dword v0, off, s[0:3], s32{{$}}
+; FIXEDABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; FIXEDABI: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc{{$}}
+; FIXEDABI: s_setpc_b64
define void @too_many_args_use_workitem_id_x_byval(
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
; sp[2] = stack passed workitem ID x
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval:
+; VARABI: enable_vgpr_workitem_id = 0
+; VARABI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
+; VARABI: s_movk_i32 s32, 0x400{{$}}
+; VARABI: buffer_store_dword [[K]], off, s[0:3], 0 offset:4
+; VARABI: buffer_store_dword v0, off, s[0:3], s32 offset:4
+; VARABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], 0 offset:4
+; VARABI: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}}
+; VARABI: v_mov_b32_e32 [[RELOAD_BYVAL]],
+; VARABI: s_swappc_b64
-; GCN: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7
-; GCN: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}}
-; GCN: s_movk_i32 s32, 0x400{{$}}
-; GCN: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140
+; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7
+; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}}
+; FIXEDABI: s_movk_i32 s32, 0x400{{$}}
+; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140
-; GCN: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
+; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
; FIXME: Why this reload?
-; GCN: buffer_load_dword [[RELOAD:v[0-9]+]], off, s[0:3], 0 offset:4{{$}}
+; FIXEDABI: buffer_load_dword [[RELOAD:v[0-9]+]], off, s[0:3], 0 offset:4{{$}}
-; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
-; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
-; GCN-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
-; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
+; FIXEDABI-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
+; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
-; GCN-NOT: s32
-; GCN: buffer_store_dword [[RELOAD]], off, s[0:3], s32 offset:4
-; GCN: s_swappc_b64
+; FIXEDABI-NOT: s32
+; FIXEDABI: buffer_store_dword [[RELOAD]], off, s[0:3], s32 offset:4
+; FIXEDABI: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 {
%alloca = alloca i32, align 4, addrspace(5)
store volatile i32 999, i32 addrspace(5)* %alloca
}
; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval:
+; VARABI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
+; VARABI: buffer_store_dword [[K]], off, s[0:3], s33{{$}}
+; VARABI: buffer_store_dword v0, off, s[0:3], s32 offset:4
+; VARABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}}
+; VARABI: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}}
+; VARABI: v_mov_b32_e32 [[RELOAD_BYVAL]],
+; VARABI: s_swappc_b64
; FIXED-ABI-NOT: v31
-; GCN: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7{{$}}
-; GCN: buffer_store_dword [[K0]], off, s[0:3], s33{{$}}
-; GCN: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}}
-; GCN: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
-; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}}
+; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7{{$}}
+; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33{{$}}
+; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}}
+; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
+; FIXEDABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}}
; FIXED-ABI-NOT: v31
-; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}}
+; FIXEDABI: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}}
; FIXED-ABI-NOT: v31
-; GCN: s_swappc_b64
+; FIXEDABI: s_swappc_b64
define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
%alloca = alloca i32, align 4, addrspace(5)
store volatile i32 999, i32 addrspace(5)* %alloca
}
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz:
-
-
-
-; GCN: v_and_b32_e32 [[AND_X:v[0-9]+]], 0x3ff, v31
-; GCN-NOT: buffer_load_dword
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[AND_X]]
-; GCN-NOT: buffer_load_dword
-; GCN: v_bfe_u32 [[BFE_Y:v[0-9]+]], v31, 10, 10
-; GCN-NEXT: v_bfe_u32 [[BFE_Z:v[0-9]+]], v31, 20, 10
-; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]]
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]]
+; VARABI-NOT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
+; VARABI: buffer_load_dword v32, off, s[0:3], s32{{$}}
+; VARABI-NOT: buffer_load_dword
+
+; VARABI: v_and_b32_e32 [[AND_X:v[0-9]+]], 0x3ff, v32
+; VARABI-NOT: buffer_load_dword
+; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[AND_X]]
+; VARABI-NOT: buffer_load_dword
+; VARABI: v_bfe_u32 [[BFE_Y:v[0-9]+]], v32, 10, 10
+; VARABI-NEXT: v_bfe_u32 [[BFE_Z:v[0-9]+]], v32, 20, 10
+; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]]
+; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]]
+; VARABI: s_setpc_b64
+
+
+; FIXEDABI: v_and_b32_e32 [[AND_X:v[0-9]+]], 0x3ff, v31
+; FIXEDABI-NOT: buffer_load_dword
+; FIXEDABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[AND_X]]
+; FIXEDABI-NOT: buffer_load_dword
+; FIXEDABI: v_bfe_u32 [[BFE_Y:v[0-9]+]], v31, 10, 10
+; FIXEDABI-NEXT: v_bfe_u32 [[BFE_Z:v[0-9]+]], v31, 20, 10
+; FIXEDABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]]
+; FIXEDABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]]
define void @too_many_args_use_workitem_id_xyz(
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
; GCN-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
+; VARABI-DAG: v_or_b32_e32 [[PACKEDID:v[0-9]+]], [[TMP2]], [[TMP0]]
+; VARABI: buffer_store_dword [[PACKEDID]], off, s[0:3], s32{{$}}
-; GCN-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140
-; GCN: buffer_store_dword [[K]], off, s[0:3], s32{{$}}
+; FIXEDABI-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
+; FIXEDABI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140
+; FIXEDABI: buffer_store_dword [[K]], off, s[0:3], s32{{$}}
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 {
define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_call:
; GFX803: ; %bb.0: ; %entry
-; GFX803-NEXT: s_add_u32 s12, s12, s17
-; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX803-NEXT: s_add_u32 s0, s0, s17
-; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX803-NEXT: s_add_u32 s4, s4, s7
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; GFX803-NEXT: s_add_u32 s0, s0, s7
; GFX803-NEXT: s_addc_u32 s1, s1, 0
-; GFX803-NEXT: s_mov_b32 s12, s14
-; GFX803-NEXT: s_mov_b32 s13, s15
-; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
-; GFX803-NEXT: s_mov_b32 s14, s16
-; GFX803-NEXT: s_getpc_b64 s[18:19]
-; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
-; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5
+; GFX803-NEXT: s_getpc_b64 s[4:5]
+; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX803-NEXT: s_mov_b32 s32, 0
-; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_kern_call:
; GFX900: ; %bb.0: ; %entry
-; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GFX900-NEXT: s_add_u32 s0, s0, s17
-; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7
+; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
+; GFX900-NEXT: s_add_u32 s0, s0, s7
; GFX900-NEXT: s_addc_u32 s1, s1, 0
-; GFX900-NEXT: s_mov_b32 s12, s14
-; GFX900-NEXT: s_mov_b32 s13, s15
-; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
-; GFX900-NEXT: s_mov_b32 s14, s16
-; GFX900-NEXT: s_getpc_b64 s[18:19]
-; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
-; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
+; GFX900-NEXT: s_getpc_b64 s[4:5]
+; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX900-NEXT: s_mov_b32 s32, 0
-; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX900-NEXT: s_endpgm
-
+;
; GFX1010-LABEL: test_kern_call:
; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT: s_add_u32 s12, s12, s17
-; GFX1010-NEXT: s_mov_b32 s32, 0
-; GFX1010-NEXT: s_addc_u32 s13, s13, 0
-; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
-; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1010-NEXT: s_add_u32 s0, s0, s17
-; GFX1010-NEXT: s_addc_u32 s1, s1, 0
-; GFX1010-NEXT: s_mov_b32 s12, s14
-; GFX1010-NEXT: s_mov_b32 s13, s15
-; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
-; GFX1010-NEXT: s_mov_b32 s14, s16
-; GFX1010-NEXT: s_getpc_b64 s[18:19]
-; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
-; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
-; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX1010-NEXT: s_endpgm
+; GFX1010-NEXT: s_add_u32 s4, s4, s7
+; GFX1010-NEXT: s_mov_b32 s32, 0
+; GFX1010-NEXT: s_addc_u32 s5, s5, 0
+; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
+; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
+; GFX1010-NEXT: s_add_u32 s0, s0, s7
+; GFX1010-NEXT: s_addc_u32 s1, s1, 0
+; GFX1010-NEXT: s_getpc_b64 s[4:5]
+; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
+; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1010-NEXT: s_endpgm
entry:
tail call void @ex() #0
ret void
define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_stack_and_call:
; GFX803: ; %bb.0: ; %entry
-; GFX803-NEXT: s_add_u32 s12, s12, s17
-; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX803-NEXT: s_add_u32 s0, s0, s17
-; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX803-NEXT: s_add_u32 s4, s4, s7
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; GFX803-NEXT: s_add_u32 s0, s0, s7
; GFX803-NEXT: s_addc_u32 s1, s1, 0
-; GFX803-NEXT: s_mov_b32 s12, s14
-; GFX803-NEXT: v_mov_b32_e32 v3, 0
-; GFX803-NEXT: s_mov_b32 s13, s15
-; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
-; GFX803-NEXT: s_mov_b32 s14, s16
-; GFX803-NEXT: s_getpc_b64 s[18:19]
-; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
-; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
+; GFX803-NEXT: v_mov_b32_e32 v0, 0
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5
+; GFX803-NEXT: s_getpc_b64 s[4:5]
+; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX803-NEXT: s_movk_i32 s32, 0x400
-; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
+; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_kern_stack_and_call:
; GFX900: ; %bb.0: ; %entry
-; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GFX900-NEXT: s_add_u32 s0, s0, s17
-; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7
+; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
+; GFX900-NEXT: s_add_u32 s0, s0, s7
; GFX900-NEXT: s_addc_u32 s1, s1, 0
-; GFX900-NEXT: s_mov_b32 s12, s14
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: s_mov_b32 s13, s15
-; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
-; GFX900-NEXT: s_mov_b32 s14, s16
-; GFX900-NEXT: s_getpc_b64 s[18:19]
-; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
-; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: s_getpc_b64 s[4:5]
+; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX900-NEXT: s_movk_i32 s32, 0x400
-; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
+; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX900-NEXT: s_endpgm
-
+;
; GFX1010-LABEL: test_kern_stack_and_call:
-; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT: s_add_u32 s12, s12, s17
-; GFX1010-NEXT: s_movk_i32 s32, 0x200
-; GFX1010-NEXT: s_addc_u32 s13, s13, 0
-; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
-; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1010-NEXT: v_mov_b32_e32 v3, 0
-; GFX1010-NEXT: s_add_u32 s0, s0, s17
-; GFX1010-NEXT: s_addc_u32 s1, s1, 0
-; GFX1010-NEXT: s_mov_b32 s12, s14
-; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
-; GFX1010-NEXT: s_mov_b32 s13, s15
-; GFX1010-NEXT: s_mov_b32 s14, s16
-; GFX1010-NEXT: s_getpc_b64 s[18:19]
-; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
-; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
-; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
-; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX1010-NEXT: s_endpgm
+; GFX1010: ; %bb.0: ; %entry
+; GFX1010-NEXT: s_add_u32 s4, s4, s7
+; GFX1010-NEXT: s_movk_i32 s32, 0x200
+; GFX1010-NEXT: s_addc_u32 s5, s5, 0
+; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
+; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
+; GFX1010-NEXT: v_mov_b32_e32 v0, 0
+; GFX1010-NEXT: s_add_u32 s0, s0, s7
+; GFX1010-NEXT: s_addc_u32 s1, s1, 0
+; GFX1010-NEXT: s_getpc_b64 s[4:5]
+; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
+; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1010-NEXT: s_endpgm
entry:
%x = alloca i32, align 4, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %x, align 4
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_mov_b32 s33, 0
; GFX803-NEXT: s_endpgm
-
+;
; GFX900-LABEL: test_force_fp_kern_empty:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_mov_b32 s33, 0
define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_call:
; GFX803: ; %bb.0: ; %entry
-; GFX803-NEXT: s_add_u32 s12, s12, s17
-; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX803-NEXT: s_add_u32 s0, s0, s17
-; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX803-NEXT: s_add_u32 s4, s4, s7
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; GFX803-NEXT: s_add_u32 s0, s0, s7
; GFX803-NEXT: s_addc_u32 s1, s1, 0
-; GFX803-NEXT: s_mov_b32 s12, s14
-; GFX803-NEXT: s_mov_b32 s13, s15
-; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
-; GFX803-NEXT: s_mov_b32 s14, s16
-; GFX803-NEXT: s_getpc_b64 s[18:19]
-; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
-; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5
+; GFX803-NEXT: s_getpc_b64 s[4:5]
+; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX803-NEXT: s_mov_b32 s32, 0
; GFX803-NEXT: s_mov_b32 s33, 0
-; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_call:
; GFX900: ; %bb.0: ; %entry
-; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GFX900-NEXT: s_add_u32 s0, s0, s17
-; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7
+; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
+; GFX900-NEXT: s_add_u32 s0, s0, s7
; GFX900-NEXT: s_addc_u32 s1, s1, 0
-; GFX900-NEXT: s_mov_b32 s12, s14
-; GFX900-NEXT: s_mov_b32 s13, s15
-; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
-; GFX900-NEXT: s_mov_b32 s14, s16
-; GFX900-NEXT: s_getpc_b64 s[18:19]
-; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
-; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
+; GFX900-NEXT: s_getpc_b64 s[4:5]
+; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX900-NEXT: s_mov_b32 s32, 0
; GFX900-NEXT: s_mov_b32 s33, 0
-; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX900-NEXT: s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_call:
; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT s_add_u32 s12, s12, s17
-; GFX1010-NEXT s_mov_b32 s32, 0
-; GFX1010-NEXT s_mov_b32 s33, 0
-; GFX1010-NEXT s_addc_u32 s13, s13, 0
-; GFX1010-NEXT s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX1010-NEXT s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
-; GFX1010-NEXT v_lshlrev_b32_e32 v2, 20, v2
-; GFX1010-NEXT v_lshlrev_b32_e32 v1, 10, v1
-; GFX1010-NEXT s_add_u32 s0, s0, s17
-; GFX1010-NEXT s_addc_u32 s1, s1, 0
-; GFX1010-NEXT s_mov_b32 s12, s14
-; GFX1010-NEXT s_mov_b32 s13, s15
-; GFX1010-NEXT v_or3_b32 v31, v0, v1, v2
-; GFX1010-NEXT s_mov_b32 s14, s16
-; GFX1010-NEXT s_getpc_b64 s[18:19]
-; GFX1010-NEXT s_add_u32 s18, s18, ex@rel32@lo+4
-; GFX1010-NEXT s_addc_u32 s19, s19, ex@rel32@hi+12
-; GFX1010-NEXT s_swappc_b64 s[30:31], s[18:19]
-; GFX1010-NEXT s_endpgm
+; GFX1010-NEXT: s_add_u32 s4, s4, s7
+; GFX1010-NEXT: s_mov_b32 s32, 0
+; GFX1010-NEXT: s_mov_b32 s33, 0
+; GFX1010-NEXT: s_addc_u32 s5, s5, 0
+; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
+; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
+; GFX1010-NEXT: s_add_u32 s0, s0, s7
+; GFX1010-NEXT: s_addc_u32 s1, s1, 0
+; GFX1010-NEXT: s_getpc_b64 s[4:5]
+; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
+; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1010-NEXT: s_endpgm
entry:
tail call void @ex() #2
ret void
define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_stack_and_call:
; GFX803: ; %bb.0: ; %entry
-; GFX803-NEXT: s_add_u32 s12, s12, s17
-; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX803-NEXT: s_add_u32 s0, s0, s17
-; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX803-NEXT: s_addc_u32 s1, s1, 0
-; GFX803-NEXT: s_mov_b32 s12, s14
+; GFX803-NEXT: s_add_u32 s4, s4, s7
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; GFX803-NEXT: s_add_u32 s0, s0, s7
; GFX803-NEXT: s_mov_b32 s33, 0
-; GFX803-NEXT: v_mov_b32_e32 v3, 0
-; GFX803-NEXT: s_mov_b32 s13, s15
-; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
-; GFX803-NEXT: s_mov_b32 s14, s16
-; GFX803-NEXT: s_getpc_b64 s[18:19]
-; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
-; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
+; GFX803-NEXT: s_addc_u32 s1, s1, 0
+; GFX803-NEXT: v_mov_b32_e32 v0, 0
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5
+; GFX803-NEXT: s_getpc_b64 s[4:5]
+; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX803-NEXT: s_movk_i32 s32, 0x400
-; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
+; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_stack_and_call:
; GFX900: ; %bb.0: ; %entry
-; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GFX900-NEXT: s_add_u32 s0, s0, s17
-; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7
+; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
+; GFX900-NEXT: s_add_u32 s0, s0, s7
; GFX900-NEXT: s_addc_u32 s1, s1, 0
-; GFX900-NEXT: s_mov_b32 s12, s14
; GFX900-NEXT: s_mov_b32 s33, 0
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: s_mov_b32 s13, s15
-; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
-; GFX900-NEXT: s_mov_b32 s14, s16
-; GFX900-NEXT: s_getpc_b64 s[18:19]
-; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
-; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: s_getpc_b64 s[4:5]
+; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX900-NEXT: s_movk_i32 s32, 0x400
-; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
+; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX900-NEXT: s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_stack_and_call:
; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT: s_add_u32 s12, s12, s17
-; GFX1010-NEXT: s_movk_i32 s32, 0x200
-; GFX1010-NEXT: s_mov_b32 s33, 0
-; GFX1010-NEXT: s_addc_u32 s13, s13, 0
-; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
-; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1010-NEXT: v_mov_b32_e32 v3, 0
-; GFX1010-NEXT: s_add_u32 s0, s0, s17
-; GFX1010-NEXT: s_addc_u32 s1, s1, 0
-; GFX1010-NEXT: s_mov_b32 s12, s14
-; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
-; GFX1010-NEXT: s_mov_b32 s13, s15
-; GFX1010-NEXT: s_mov_b32 s14, s16
-; GFX1010-NEXT: s_getpc_b64 s[18:19]
-; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
-; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
-; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
-; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX1010-NEXT: s_endpgm
+; GFX1010-NEXT: s_add_u32 s4, s4, s7
+; GFX1010-NEXT: s_movk_i32 s32, 0x200
+; GFX1010-NEXT: s_mov_b32 s33, 0
+; GFX1010-NEXT: s_addc_u32 s5, s5, 0
+; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
+; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
+; GFX1010-NEXT: v_mov_b32_e32 v0, 0
+; GFX1010-NEXT: s_add_u32 s0, s0, s7
+; GFX1010-NEXT: s_addc_u32 s1, s1, 0
+; GFX1010-NEXT: s_getpc_b64 s[4:5]
+; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
+; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
+; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1010-NEXT: s_endpgm
entry:
%x = alloca i32, align 4, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %x, align 4
; GCN-LABEL: call_split_type_used_outside_block_v2f32:
; GCN: ; %bb.0: ; %bb0
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, s[16:17]
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_writelane_b32 v40, s33, 2
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_add_u32 s32, s32, 0x400
-; GCN-NEXT: s_getpc_b64 s[16:17]
-; GCN-NEXT: s_add_u32 s16, s16, func_v2f32@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s17, s17, func_v2f32@rel32@hi+12
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, func_v2f32@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, func_v2f32@rel32@hi+12
; GCN-NEXT: v_writelane_b32 v40, s31, 1
-; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: v_readlane_b32 s4, v40, 0
; GCN-NEXT: v_readlane_b32 s5, v40, 1
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
define float @call_split_type_used_outside_block_v3f32() #0 {
; GCN-LABEL: call_split_type_used_outside_block_v3f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, s[16:17]
-; GCN-NEXT: v_writelane_b32 v40, s33, 2
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: s_mov_b32 s33, s32
-; GCN-NEXT: s_add_u32 s32, s32, 0x400
-; GCN-NEXT: s_getpc_b64 s[16:17]
-; GCN-NEXT: s_add_u32 s16, s16, func_v3f32@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s17, s17, func_v3f32@rel32@hi+12
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
-; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT: v_readlane_b32 s4, v40, 0
-; GCN-NEXT: v_readlane_b32 s5, v40, 1
-; GCN-NEXT: s_sub_u32 s32, s32, 0x400
-; GCN-NEXT: v_readlane_b32 s33, v40, 2
-; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[4:5]
+; GCN: ; %bb.0: ; %bb0
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_writelane_b32 v40, s33, 2
+; GCN-NEXT: v_writelane_b32 v40, s30, 0
+; GCN-NEXT: s_mov_b32 s33, s32
+; GCN-NEXT: s_add_u32 s32, s32, 0x400
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, func_v3f32@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, func_v3f32@rel32@hi+12
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
+; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT: v_readlane_b32 s4, v40, 0
+; GCN-NEXT: v_readlane_b32 s5, v40, 1
+; GCN-NEXT: s_sub_u32 s32, s32, 0x400
+; GCN-NEXT: v_readlane_b32 s33, v40, 2
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[4:5]
+bb0:
%split.ret.type = call <3 x float> @func_v3f32()
br label %bb1
define half @call_split_type_used_outside_block_v4f16() #0 {
; GCN-LABEL: call_split_type_used_outside_block_v4f16:
; GCN: ; %bb.0: ; %bb0
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, s[16:17]
-; GCN-NEXT: v_writelane_b32 v40, s33, 2
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: s_mov_b32 s33, s32
-; GCN-NEXT: s_add_u32 s32, s32, 0x400
-; GCN-NEXT: s_getpc_b64 s[16:17]
-; GCN-NEXT: s_add_u32 s16, s16, func_v4f16@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s17, s17, func_v4f16@rel32@hi+12
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
-; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT: v_readlane_b32 s4, v40, 0
-; GCN-NEXT: v_readlane_b32 s5, v40, 1
-; GCN-NEXT: s_sub_u32 s32, s32, 0x400
-; GCN-NEXT: v_readlane_b32 s33, v40, 2
-; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[4:5]
-
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_writelane_b32 v40, s33, 2
+; GCN-NEXT: v_writelane_b32 v40, s30, 0
+; GCN-NEXT: s_mov_b32 s33, s32
+; GCN-NEXT: s_add_u32 s32, s32, 0x400
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, func_v4f16@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, func_v4f16@rel32@hi+12
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
+; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT: v_readlane_b32 s4, v40, 0
+; GCN-NEXT: v_readlane_b32 s5, v40, 1
+; GCN-NEXT: s_sub_u32 s32, s32, 0x400
+; GCN-NEXT: v_readlane_b32 s33, v40, 2
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[4:5]
bb0:
%split.ret.type = call <4 x half> @func_v4f16()
br label %bb1
define { i32, half } @call_split_type_used_outside_block_struct() #0 {
; GCN-LABEL: call_split_type_used_outside_block_struct:
; GCN: ; %bb.0: ; %bb0
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, s[16:17]
-; GCN-NEXT: v_writelane_b32 v40, s33, 2
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: s_mov_b32 s33, s32
-; GCN-NEXT: s_add_u32 s32, s32, 0x400
-; GCN-NEXT: s_getpc_b64 s[16:17]
-; GCN-NEXT: s_add_u32 s16, s16, func_struct@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s17, s17, func_struct@rel32@hi+12
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
-; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT: v_readlane_b32 s4, v40, 0
-; GCN-NEXT: v_mov_b32_e32 v1, v4
-; GCN-NEXT: v_readlane_b32 s5, v40, 1
-; GCN-NEXT: s_sub_u32 s32, s32, 0x400
-; GCN-NEXT: v_readlane_b32 s33, v40, 2
-; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[4:5]
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_writelane_b32 v40, s33, 2
+; GCN-NEXT: v_writelane_b32 v40, s30, 0
+; GCN-NEXT: s_mov_b32 s33, s32
+; GCN-NEXT: s_add_u32 s32, s32, 0x400
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, func_struct@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, func_struct@rel32@hi+12
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
+; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT: v_readlane_b32 s4, v40, 0
+; GCN-NEXT: v_mov_b32_e32 v1, v4
+; GCN-NEXT: v_readlane_b32 s5, v40, 1
+; GCN-NEXT: s_sub_u32 s32, s32, 0x400
+; GCN-NEXT: v_readlane_b32 s33, v40, 2
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[4:5]
bb0:
%split.ret.type = call { <4 x i32>, <4 x half> } @func_struct()
br label %bb1
define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 {
; GCN-LABEL: v3i16_registers:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GCN-NEXT: s_load_dword s12, s[8:9], 0x0
-; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GCN-NEXT: s_add_u32 s0, s0, s17
-; GCN-NEXT: s_addc_u32 s1, s1, 0
-; GCN-NEXT: s_mov_b32 s32, 0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_and_b32 s12, 1, s12
-; GCN-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 1
-; GCN-NEXT: s_and_b64 vcc, exec, s[12:13]
-; GCN-NEXT: s_cbranch_vccnz BB4_2
-; GCN-NEXT: ; %bb.1: ; %if.else
-; GCN-NEXT: s_add_u32 s8, s8, 8
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NEXT: s_addc_u32 s9, s9, 0
-; GCN-NEXT: s_mov_b32 s12, s14
-; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
-; GCN-NEXT: s_mov_b32 s13, s15
-; GCN-NEXT: s_mov_b32 s14, s16
-; GCN-NEXT: s_getpc_b64 s[18:19]
-; GCN-NEXT: s_add_u32 s18, s18, func_v3i16@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s19, s19, func_v3i16@rel32@hi+12
-; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GCN-NEXT: s_branch BB4_3
-; GCN-NEXT: BB4_2:
-; GCN-NEXT: s_mov_b32 s4, 0
-; GCN-NEXT: s_mov_b32 s5, s4
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NEXT: BB4_3: ; %if.end
-; GCN-NEXT: global_store_short v[0:1], v1, off
-; GCN-NEXT: global_store_dword v[0:1], v0, off
-; GCN-NEXT: s_endpgm
+; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
+; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
+; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
+; GCN-NEXT: s_add_u32 s0, s0, s9
+; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s4, 1, s4
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 1
+; GCN-NEXT: s_and_b64 vcc, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s32, 0
+; GCN-NEXT: s_cbranch_vccnz BB4_2
+; GCN-NEXT: ; %bb.1: ; %if.else
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, func_v3i16@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, func_v3i16@rel32@hi+12
+; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT: s_branch BB4_3
+; GCN-NEXT: BB4_2:
+; GCN-NEXT: s_mov_b32 s4, 0
+; GCN-NEXT: s_mov_b32 s5, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: BB4_3: ; %if.end
+; GCN-NEXT: global_store_short v[0:1], v1, off
+; GCN-NEXT: global_store_dword v[0:1], v0, off
+; GCN-NEXT: s_endpgm
entry:
br i1 %cond, label %if.then, label %if.else
define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 {
; GCN-LABEL: v3f16_registers:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GCN-NEXT: s_load_dword s12, s[8:9], 0x0
-; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GCN-NEXT: s_add_u32 s0, s0, s17
-; GCN-NEXT: s_addc_u32 s1, s1, 0
-; GCN-NEXT: s_mov_b32 s32, 0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_and_b32 s12, 1, s12
-; GCN-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 1
-; GCN-NEXT: s_and_b64 vcc, exec, s[12:13]
-; GCN-NEXT: s_cbranch_vccnz BB5_2
-; GCN-NEXT: %bb.1: ; %if.else
-; GCN-NEXT: s_add_u32 s8, s8, 8
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NEXT: s_addc_u32 s9, s9, 0
-; GCN-NEXT: s_mov_b32 s12, s14
-; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
-; GCN-NEXT: s_mov_b32 s13, s15
-; GCN-NEXT: s_mov_b32 s14, s16
-; GCN-NEXT: s_getpc_b64 s[18:19]
-; GCN-NEXT: s_add_u32 s18, s18, func_v3f16@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s19, s19, func_v3f16@rel32@hi+12
-; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GCN-NEXT: s_branch BB5_3
+; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
+; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
+; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
+; GCN-NEXT: s_add_u32 s0, s0, s9
+; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s4, 1, s4
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 1
+; GCN-NEXT: s_and_b64 vcc, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s32, 0
+; GCN-NEXT: s_cbranch_vccnz BB5_2
+; GCN-NEXT: ; %bb.1: ; %if.else
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, func_v3f16@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, func_v3f16@rel32@hi+12
+; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT: s_branch BB5_3
; GCN-NEXT: BB5_2:
-; GCN-NEXT: s_mov_b32 s4, 0
-; GCN-NEXT: s_mov_b32 s5, s4
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: s_mov_b32 s4, 0
+; GCN-NEXT: s_mov_b32 s5, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: BB5_3: ; %if.end
+; GCN-NEXT: global_store_short v[0:1], v1, off
+; GCN-NEXT: global_store_dword v[0:1], v0, off
+; GCN-NEXT: s_endpgm
entry:
br i1 %cond, label %if.then, label %if.else
--- /dev/null
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: define internal void @indirect() #0 {
+define internal void @indirect() {
+ ret void
+}
+
+; GCN-LABEL: define internal void @direct() #1 {
+define internal void @direct() {
+ %fptr = alloca void()*
+ store void()* @indirect, void()** %fptr
+ %fp = load void()*, void()** %fptr
+ call void %fp()
+ ret void
+}
+
+; GCN-LABEL: define amdgpu_kernel void @test_direct_indirect_call() #2 {
+define amdgpu_kernel void @test_direct_indirect_call() {
+ call void @direct()
+ ret void
+}
+
+; attributes #0 = { "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }
+; attributes #1 = { "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-stack-objects" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" "uniform-work-group-size"="false" }
+; attributes #2 = { "amdgpu-calls" "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" "uniform-work-group-size"="false" }
--- /dev/null
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: define internal void @indirect() #0 {
+define internal void @indirect() {
+ ret void
+}
+
+; GCN-LABEL: define amdgpu_kernel void @test_simple_indirect_call() #1 {
+define amdgpu_kernel void @test_simple_indirect_call() #0 {
+ %fptr = alloca void()*
+ store void()* @indirect, void()** %fptr
+ %fp = load void()*, void()** %fptr
+ call void %fp()
+ ret void
+}
+
+attributes #0 = { "amdgpu-dispatch-id" }
+
+; Expected attribute groups after the amdgpu-annotate-kernel-features pass runs.
+; Kept as comments so the input IR stays valid -- a second uncommented
+; "attributes #0" definition would be a duplicate-group-ID parse error.
+; attributes #0 = { "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }
+; attributes #1 = { "amdgpu-calls" "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-stack-objects" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }
+
; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GCN: %13:vgpr_32, %14:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY2]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
- ; GCN: %15:vgpr_32, %16:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
- ; GCN: %17:vgpr_32 = nofpexcept V_RCP_F32_e64 0, %15, 0, 0, implicit $mode, implicit $exec
+ ; GCN: %6:vgpr_32, %7:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY2]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
+ ; GCN: %8:vgpr_32, %9:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
+ ; GCN: %10:vgpr_32 = nofpexcept V_RCP_F32_e64 0, %8, 0, 0, implicit $mode, implicit $exec
; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 3
; GCN: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216
; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GCN: S_SETREG_B32_mode killed [[S_MOV_B32_]], 2305, implicit-def $mode, implicit $mode
- ; GCN: %21:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %15, 0, %17, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec
- ; GCN: %22:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %21, 0, %17, 0, %17, 0, 0, implicit $mode, implicit $exec
- ; GCN: %23:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %13, 0, %22, 0, 0, implicit $mode, implicit $exec
- ; GCN: %24:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %15, 0, %23, 0, %13, 0, 0, implicit $mode, implicit $exec
- ; GCN: %25:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %24, 0, %22, 0, %23, 0, 0, implicit $mode, implicit $exec
- ; GCN: %26:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %15, 0, %25, 0, %13, 0, 0, implicit $mode, implicit $exec
+ ; GCN: %14:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %8, 0, %10, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec
+ ; GCN: %15:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %14, 0, %10, 0, %10, 0, 0, implicit $mode, implicit $exec
+ ; GCN: %16:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %6, 0, %15, 0, 0, implicit $mode, implicit $exec
+ ; GCN: %17:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %8, 0, %16, 0, %6, 0, 0, implicit $mode, implicit $exec
+ ; GCN: %18:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %17, 0, %15, 0, %16, 0, 0, implicit $mode, implicit $exec
+ ; GCN: %19:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %8, 0, %18, 0, %6, 0, 0, implicit $mode, implicit $exec
; GCN: S_SETREG_B32_mode killed [[S_MOV_B32_2]], 2305, implicit-def dead $mode, implicit $mode
- ; GCN: $vcc = COPY %14
- ; GCN: %27:vgpr_32 = nofpexcept V_DIV_FMAS_F32_e64 0, killed %26, 0, %22, 0, %25, 0, 0, implicit $mode, implicit $vcc, implicit $exec
- ; GCN: %28:vgpr_32 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed %27, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
+ ; GCN: $vcc = COPY %7
+ ; GCN: %20:vgpr_32 = nofpexcept V_DIV_FMAS_F32_e64 0, killed %19, 0, %15, 0, %18, 0, 0, implicit $mode, implicit $vcc, implicit $exec
+ ; GCN: %21:vgpr_32 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed %20, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GCN: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
- ; GCN: $vgpr0 = COPY %28
+ ; GCN: $vgpr0 = COPY %21
; GCN: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
; GCN: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
entry:
; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GCN: %13:vgpr_32, %14:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY2]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
- ; GCN: %15:vgpr_32, %16:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
- ; GCN: %17:vgpr_32 = nnan nofpexcept V_RCP_F32_e64 0, %15, 0, 0, implicit $mode, implicit $exec
+ ; GCN: %6:vgpr_32, %7:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY2]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
+ ; GCN: %8:vgpr_32, %9:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
+ ; GCN: %10:vgpr_32 = nnan nofpexcept V_RCP_F32_e64 0, %8, 0, 0, implicit $mode, implicit $exec
; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 3
; GCN: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216
; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GCN: S_SETREG_B32_mode killed [[S_MOV_B32_]], 2305, implicit-def $mode, implicit $mode
- ; GCN: %21:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %15, 0, %17, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec
- ; GCN: %22:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %21, 0, %17, 0, %17, 0, 0, implicit $mode, implicit $exec
- ; GCN: %23:vgpr_32 = nnan nofpexcept V_MUL_F32_e64 0, %13, 0, %22, 0, 0, implicit $mode, implicit $exec
- ; GCN: %24:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %15, 0, %23, 0, %13, 0, 0, implicit $mode, implicit $exec
- ; GCN: %25:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %24, 0, %22, 0, %23, 0, 0, implicit $mode, implicit $exec
- ; GCN: %26:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %15, 0, %25, 0, %13, 0, 0, implicit $mode, implicit $exec
+ ; GCN: %14:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %8, 0, %10, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec
+ ; GCN: %15:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %14, 0, %10, 0, %10, 0, 0, implicit $mode, implicit $exec
+ ; GCN: %16:vgpr_32 = nnan nofpexcept V_MUL_F32_e64 0, %6, 0, %15, 0, 0, implicit $mode, implicit $exec
+ ; GCN: %17:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %8, 0, %16, 0, %6, 0, 0, implicit $mode, implicit $exec
+ ; GCN: %18:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %17, 0, %15, 0, %16, 0, 0, implicit $mode, implicit $exec
+ ; GCN: %19:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %8, 0, %18, 0, %6, 0, 0, implicit $mode, implicit $exec
; GCN: S_SETREG_B32_mode killed [[S_MOV_B32_2]], 2305, implicit-def dead $mode, implicit $mode
- ; GCN: $vcc = COPY %14
- ; GCN: %27:vgpr_32 = nnan nofpexcept V_DIV_FMAS_F32_e64 0, killed %26, 0, %22, 0, %25, 0, 0, implicit $mode, implicit $vcc, implicit $exec
- ; GCN: %28:vgpr_32 = nnan nofpexcept V_DIV_FIXUP_F32_e64 0, killed %27, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
+ ; GCN: $vcc = COPY %7
+ ; GCN: %20:vgpr_32 = nnan nofpexcept V_DIV_FMAS_F32_e64 0, killed %19, 0, %15, 0, %18, 0, 0, implicit $mode, implicit $vcc, implicit $exec
+ ; GCN: %21:vgpr_32 = nnan nofpexcept V_DIV_FIXUP_F32_e64 0, killed %20, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GCN: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
- ; GCN: $vgpr0 = COPY %28
+ ; GCN: $vgpr0 = COPY %21
; GCN: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
; GCN: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
entry:
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8
; GCN: ; NumSgprs: 37
-; GCN: ; NumVgprs: 32
+; GCN: ; NumVgprs: 9
define amdgpu_kernel void @kernel_call() #0 {
%vgpr = load volatile i32, i32 addrspace(1)* undef
tail call void @func()
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8
; GCN: ; NumSgprs: 32
-; GCN: ; NumVgprs: 32
+; GCN: ; NumVgprs: 9
define void @func_regular_call() #1 {
%vgpr = load volatile i32, i32 addrspace(1)* undef
tail call void @func()
; GCN-LABEL: {{^}}func_tail_call:
; GCN: s_waitcnt
-; GCN-NEXT: s_getpc_b64 s[16:17]
-; GCN-NEXT: s_add_u32 s16,
-; GCN-NEXT: s_addc_u32 s17,
-; GCN-NEXT: s_setpc_b64 s[16:17]
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4,
+; GCN-NEXT: s_addc_u32 s5,
+; GCN-NEXT: s_setpc_b64 s[4:5]
; GCN: ; NumSgprs: 32
-; GCN: ; NumVgprs: 32
+; GCN: ; NumVgprs: 8
define void @func_tail_call() #1 {
tail call void @func()
ret void
; GCN: s_setpc_b64
; GCN: ; NumSgprs: 32
-; GCN: ; NumVgprs: 32
+; GCN: ; NumVgprs: 9
define void @func_call_tail_call() #1 {
%vgpr = load volatile i32, i32 addrspace(1)* undef
tail call void @func()
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, 0
-; GFX8-NEXT: s_mov_b32 m0, -1
+; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_write_b32 v0, v0
-; GFX8-NEXT: s_mov_b64 s[0:1], s[6:7]
+; GFX8-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX8-NEXT: s_trap 2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
; GFX8-LABEL: func_use_lds_global_constexpr_cast:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_mov_b64 s[0:1], s[6:7]
+; GFX8-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX8-NEXT: s_trap 2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GCN-LABEL: {{^}}func_implicitarg_ptr:
; GCN: s_waitcnt
-; HSA: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
-; MESA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @func_implicitarg_ptr() #0 {
; GCN-LABEL: {{^}}opencl_func_implicitarg_ptr:
; GCN: s_waitcnt
-; HSA: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
-; MESA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @opencl_func_implicitarg_ptr() #0 {
; HSA: kernarg_segment_byte_size = 112
; MESA: kernarg_segment_byte_size = 128
-; HSA: s_add_u32 s8, s8, 0x70
+; HSA: s_add_u32 s4, s4, 0x70
; MESA: s_add_u32 s4, s4, 0x70
-; HSA: s_addc_u32 s9, s9, 0{{$}}
-; MESA: s_addc_u32 s5, s5, 0{{$}}
+; GCN: s_addc_u32 s5, s5, 0{{$}}
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_implicitarg_ptr_func([112 x i8]) #0 {
call void @func_implicitarg_ptr()
; HSA: kernarg_segment_byte_size = 160
; MESA: kernarg_segment_byte_size = 128
-; HSA: s_add_u32 s8, s8, 0x70
-; HSA: s_addc_u32 s9, s9, 0{{$}}
-; MESA: s_add_u32 s4, s4, 0x70
-; MESA: s_addc_u32 s5, s5, 0{{$}}
+; GCN: s_add_u32 s4, s4, 0x70
+; GCN: s_addc_u32 s5, s5, 0{{$}}
; GCN: s_swappc_b64
define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func([112 x i8]) #1 {
call void @func_implicitarg_ptr()
}
; GCN-LABEL: {{^}}func_call_implicitarg_ptr_func:
-; HSA-NOT: s8
-; HSA-NOT: s9
-; HSA-NOT: s[8:9]
-; MESA-NOT: s4
-; MESA-NOT: s5
-; MESA-NOT: s[4:5]
+; GCN-NOT: s4
+; GCN-NOT: s5
+; GCN-NOT: s[4:5]
; Non-kernel function calling another non-kernel function that reads the
; implicit argument pointer. The negative checks above this define assert
; that no extra register setup appears in this caller's body — the
; implicitarg pointer SGPR pair is expected to simply pass through
; unchanged to the callee.
define void @func_call_implicitarg_ptr_func() #0 {
call void @func_implicitarg_ptr()
ret void
}
; GCN-LABEL: {{^}}opencl_func_call_implicitarg_ptr_func:
-; HSA-NOT: s8
-; HSA-NOT: s9
-; HSA-NOT: s[8:9]
-; MESA-NOT: s4
-; MESA-NOT: s5
-; MESA-NOT: s[4:5]
+; GCN-NOT: s4
+; GCN-NOT: s5
+; GCN-NOT: s[4:5]
define void @opencl_func_call_implicitarg_ptr_func() #0 {
call void @func_implicitarg_ptr()
ret void
; GCN: s_waitcnt
; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0
-; HSA: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
-; MESA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; GCN: s_waitcnt lgkmcnt(0)
define void @func_kernarg_implicitarg_ptr() #0 {
%kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
; GCN: s_waitcnt
; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0
-; HSA: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
-; MESA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; GCN: s_waitcnt lgkmcnt(0)
define void @opencl_func_kernarg_implicitarg_ptr() #0 {
%kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
}
; GCN-LABEL: {{^}}kernel_call_kernarg_implicitarg_ptr_func:
-; HSA: s_add_u32 s8, s8, 0x70
-; HSA: s_addc_u32 s9, s9, 0
-; MESA: s_add_u32 s4, s4, 0x70
-; MESA: s_addc_u32 s5, s5, 0
+; GCN: s_add_u32 s4, s4, 0x70
+; GCN: s_addc_u32 s5, s5, 0
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_kernarg_implicitarg_ptr_func([112 x i8]) #0 {
call void @func_kernarg_implicitarg_ptr()
define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 {
; GFX9-LABEL: slsr1_1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_or_saveexec_b64 s[16:17], -1
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[16:17]
-; GFX9-NEXT: v_writelane_b32 v44, s33, 15
-; GFX9-NEXT: v_writelane_b32 v44, s34, 0
-; GFX9-NEXT: v_writelane_b32 v44, s35, 1
-; GFX9-NEXT: v_writelane_b32 v44, s36, 2
-; GFX9-NEXT: v_writelane_b32 v44, s37, 3
-; GFX9-NEXT: v_writelane_b32 v44, s38, 4
-; GFX9-NEXT: v_writelane_b32 v44, s39, 5
-; GFX9-NEXT: v_writelane_b32 v44, s40, 6
-; GFX9-NEXT: v_writelane_b32 v44, s41, 7
-; GFX9-NEXT: v_writelane_b32 v44, s42, 8
-; GFX9-NEXT: v_writelane_b32 v44, s43, 9
-; GFX9-NEXT: v_writelane_b32 v44, s44, 10
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_add_u32 s32, s32, 0x800
-; GFX9-NEXT: s_mov_b64 s[40:41], s[4:5]
-; GFX9-NEXT: v_writelane_b32 v44, s46, 11
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v44, s47, 12
-; GFX9-NEXT: s_load_dwordx2 s[46:47], s[4:5], 0x0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v41, v1
-; GFX9-NEXT: v_mov_b32_e32 v42, v0
-; GFX9-NEXT: v_writelane_b32 v44, s30, 13
-; GFX9-NEXT: v_mul_u32_u24_e32 v0, v42, v41
-; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
-; GFX9-NEXT: v_writelane_b32 v44, s31, 14
-; GFX9-NEXT: v_mov_b32_e32 v40, v31
-; GFX9-NEXT: s_mov_b32 s42, s14
-; GFX9-NEXT: s_mov_b32 s43, s13
-; GFX9-NEXT: s_mov_b32 s44, s12
-; GFX9-NEXT: s_mov_b64 s[34:35], s[10:11]
-; GFX9-NEXT: s_mov_b64 s[36:37], s[8:9]
-; GFX9-NEXT: s_mov_b64 s[38:39], s[6:7]
-; GFX9-NEXT: v_and_b32_e32 v43, 0xffffff, v41
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[46:47]
-; GFX9-NEXT: v_mad_u32_u24 v41, v42, v41, v43
-; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
-; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
-; GFX9-NEXT: s_mov_b64 s[8:9], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT: s_mov_b32 s12, s44
-; GFX9-NEXT: s_mov_b32 s13, s43
-; GFX9-NEXT: s_mov_b32 s14, s42
-; GFX9-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-NEXT: v_mov_b32_e32 v0, v41
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[46:47]
-; GFX9-NEXT: v_add_u32_e32 v0, v41, v43
-; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
-; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
-; GFX9-NEXT: s_mov_b64 s[8:9], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT: s_mov_b32 s12, s44
-; GFX9-NEXT: s_mov_b32 s13, s43
-; GFX9-NEXT: s_mov_b32 s14, s42
-; GFX9-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[46:47]
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s4, v44, 13
-; GFX9-NEXT: v_readlane_b32 s5, v44, 14
-; GFX9-NEXT: v_readlane_b32 s47, v44, 12
-; GFX9-NEXT: v_readlane_b32 s46, v44, 11
-; GFX9-NEXT: v_readlane_b32 s44, v44, 10
-; GFX9-NEXT: v_readlane_b32 s43, v44, 9
-; GFX9-NEXT: v_readlane_b32 s42, v44, 8
-; GFX9-NEXT: v_readlane_b32 s41, v44, 7
-; GFX9-NEXT: v_readlane_b32 s40, v44, 6
-; GFX9-NEXT: v_readlane_b32 s39, v44, 5
-; GFX9-NEXT: v_readlane_b32 s38, v44, 4
-; GFX9-NEXT: v_readlane_b32 s37, v44, 3
-; GFX9-NEXT: v_readlane_b32 s36, v44, 2
-; GFX9-NEXT: v_readlane_b32 s35, v44, 1
-; GFX9-NEXT: v_readlane_b32 s34, v44, 0
-; GFX9-NEXT: s_sub_u32 s32, s32, 0x800
-; GFX9-NEXT: v_readlane_b32 s33, v44, 15
-; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[4:5]
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: v_writelane_b32 v43, s33, 4
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_add_u32 s32, s32, 0x800
+; GFX9-NEXT: v_writelane_b32 v43, s34, 0
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
+; GFX9-NEXT: v_writelane_b32 v43, s35, 1
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, v1
+; GFX9-NEXT: v_mov_b32_e32 v41, v0
+; GFX9-NEXT: v_writelane_b32 v43, s30, 2
+; GFX9-NEXT: v_mul_u32_u24_e32 v0, v41, v40
+; GFX9-NEXT: v_writelane_b32 v43, s31, 3
+; GFX9-NEXT: v_and_b32_e32 v42, 0xffffff, v40
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT: v_mad_u32_u24 v40, v41, v40, v42
+; GFX9-NEXT: v_mov_b32_e32 v0, v40
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT: v_add_u32_e32 v0, v40, v42
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: v_readlane_b32 s4, v43, 2
+; GFX9-NEXT: v_readlane_b32 s5, v43, 3
+; GFX9-NEXT: v_readlane_b32 s35, v43, 1
+; GFX9-NEXT: v_readlane_b32 s34, v43, 0
+; GFX9-NEXT: s_sub_u32 s32, s32, 0x800
+; GFX9-NEXT: v_readlane_b32 s33, v43, 4
+; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[4:5]
%b = and i32 %b.arg, 16777215
%s = and i32 %s.arg, 16777215
; CHECK-LABEL: csr_vgpr_spill_fp_callee:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_mov_b32 s15, s33
+; CHECK-NEXT: s_mov_b32 s8, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_add_u32 s32, s32, 0x400
-; CHECK-NEXT: s_getpc_b64 s[18:19]
-; CHECK-NEXT: s_add_u32 s18, s18, callee_has_fp@rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s19, s19, callee_has_fp@rel32@hi+12
+; CHECK-NEXT: s_getpc_b64 s[4:5]
+; CHECK-NEXT: s_add_u32 s4, s4, callee_has_fp@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s5, s5, callee_has_fp@rel32@hi+12
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT: s_mov_b64 s[16:17], s[30:31]
-; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; CHECK-NEXT: s_mov_b64 s[6:7], s[30:31]
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; clobber csr v40
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: s_sub_u32 s32, s32, 0x400
-; CHECK-NEXT: s_mov_b32 s33, s15
+; CHECK-NEXT: s_mov_b32 s33, s8
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: s_setpc_b64 s[16:17]
+; CHECK-NEXT: s_setpc_b64 s[6:7]
bb:
call fastcc void @callee_has_fp()
call void asm sideeffect "; clobber csr v40", "~{v40}"()
define amdgpu_kernel void @kernel_call() {
; CHECK-LABEL: kernel_call:
; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; CHECK-NEXT: s_add_u32 s0, s0, s17
-; CHECK-DAG: s_addc_u32 s1, s1, 0
-; CHECK-DAG: s_getpc_b64 s[18:19]
-; CHECK-DAG: s_add_u32 s18, s18, csr_vgpr_spill_fp_callee@rel32@lo+4
-; CHECK-DAG: s_addc_u32 s19, s19, csr_vgpr_spill_fp_callee@rel32@hi+12
-; CHECK-DAG: s_mov_b32 s32, 0
-; CHECK-DAG: s_swappc_b64 s[30:31], s[18:19]
+; CHECK-NEXT: s_add_u32 flat_scratch_lo, s4, s7
+; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
+; CHECK-NEXT: s_add_u32 s0, s0, s7
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_getpc_b64 s[4:5]
+; CHECK-NEXT: s_add_u32 s4, s4, csr_vgpr_spill_fp_callee@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s5, s5, csr_vgpr_spill_fp_callee@rel32@hi+12
+; CHECK-NEXT: s_mov_b32 s32, 0
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: s_endpgm
bb:
tail call fastcc void @csr_vgpr_spill_fp_callee()
; CHECK-LABEL: csr_vgpr_spill_fp_tailcall_callee:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_or_saveexec_b64 s[16:17], -1
+; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT: s_mov_b64 exec, s[16:17]
+; CHECK-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; clobber csr v40
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT: v_writelane_b32 v1, s33, 0
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, callee_has_fp@rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, callee_has_fp@rel32@hi+12
+; CHECK-NEXT: s_getpc_b64 s[4:5]
+; CHECK-NEXT: s_add_u32 s4, s4, callee_has_fp@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s5, s5, callee_has_fp@rel32@hi+12
; CHECK-NEXT: v_readlane_b32 s33, v1, 0
-; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
+; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT: s_mov_b64 exec, s[18:19]
-; CHECK-NEXT: s_setpc_b64 s[16:17]
+; CHECK-NEXT: s_mov_b64 exec, s[6:7]
+; CHECK-NEXT: s_setpc_b64 s[4:5]
bb:
call void asm sideeffect "; clobber csr v40", "~{v40}"()
tail call fastcc void @callee_has_fp()
define amdgpu_kernel void @kernel_tailcall() {
; CHECK-LABEL: kernel_tailcall:
; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; CHECK-NEXT: s_add_u32 s0, s0, s17
-; CHECK-DAG: s_addc_u32 s1, s1, 0
-; CHECK-DAG: s_getpc_b64 s[18:19]
-; CHECK-NEXT: s_add_u32 s18, s18, csr_vgpr_spill_fp_tailcall_callee@rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s19, s19, csr_vgpr_spill_fp_tailcall_callee@rel32@hi+12
+; CHECK-NEXT: s_add_u32 flat_scratch_lo, s4, s7
+; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
+; CHECK-NEXT: s_add_u32 s0, s0, s7
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_getpc_b64 s[4:5]
+; CHECK-NEXT: s_add_u32 s4, s4, csr_vgpr_spill_fp_tailcall_callee@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s5, s5, csr_vgpr_spill_fp_tailcall_callee@rel32@hi+12
; CHECK-NEXT: s_mov_b32 s32, 0
-; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: s_endpgm
bb:
tail call fastcc void @csr_vgpr_spill_fp_tailcall_callee()
; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4
; MUBUF-NEXT: v_lshl_add_u32 v2, v4, 2, s6
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
-; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31
+; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v5
; MUBUF-NEXT: s_mov_b32 s32, s6
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[2:3], s2
; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s4
; FLATSCR-NEXT: scratch_load_dword v2, v2, off
-; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31
+; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v5
; FLATSCR-NEXT: s_mov_b32 s32, s4
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3
; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000
; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000
; MUBUF-NEXT: v_mov_b32_e32 v2, 0
-; MUBUF-NEXT: v_mov_b32_e32 v4, s6
-; MUBUF-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; MUBUF-NEXT: v_mov_b32_e32 v5, s6
+; MUBUF-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
; MUBUF-NEXT: v_mov_b32_e32 v2, 1
-; MUBUF-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
+; MUBUF-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen offset:4
; MUBUF-NEXT: v_lshl_add_u32 v2, v3, 2, s6
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
-; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31
+; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v4
; MUBUF-NEXT: s_mov_b32 s32, s6
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000
-; FLATSCR-NEXT: v_mov_b32_e32 v4, 0
-; FLATSCR-NEXT: v_mov_b32_e32 v5, 1
+; FLATSCR-NEXT: v_mov_b32_e32 v5, 0
+; FLATSCR-NEXT: v_mov_b32_e32 v6, 1
; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s2
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[4:5], s2
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[5:6], s2
; FLATSCR-NEXT: scratch_load_dword v2, v2, off
-; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31
+; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v4
; FLATSCR-NEXT: s_mov_b32 s32, s2
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3
; GCN: v_writelane_b32 v255, s33, 2
; GCN: v_writelane_b32 v255, s30, 0
; GCN: v_writelane_b32 v255, s31, 1
-; GCN: s_swappc_b64 s[30:31], s[16:17]
+; GCN: s_swappc_b64 s[30:31], s[4:5]
; GCN: v_readlane_b32 s30, v255, 0
; GCN: v_readlane_b32 s31, v255, 1
; GCN: v_readlane_b32 s33, v255, 2
; GCN: v_writelane_b32 v254, s33, 2
; GCN: v_writelane_b32 v254, s30, 0
; GCN: v_writelane_b32 v254, s31, 1
-; GCN: s_swappc_b64 s[30:31], s[16:17]
+; GCN: s_swappc_b64 s[30:31], s[4:5]
; GCN: v_readlane_b32 s30, v254, 0
; GCN: v_readlane_b32 s31, v254, 1
; GCN: v_readlane_b32 s33, v254, 2
; GCN-LABEL: {{^}}reserve_vgpr_with_tail_call
; GCN-NOT: buffer_store_dword v255, off, s[0:3], s32
; GCN-NOT: v_writelane
-; GCN: s_setpc_b64 s[16:17]
+; GCN: s_setpc_b64 s[4:5]
define void @reserve_vgpr_with_tail_call() #0 {
%alloca = alloca i32, align 4, addrspace(5)
; GCN-LABEL: {{^}}i32_fastcc_i32_i32_a32i32:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32 offset:8
+; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}}
; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4
; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
-; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_1]]
; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_0]]
+; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_1]]
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}}
; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4
+; GCN-NOT: s32
+
; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s32{{$}}
; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s32 offset:4
+; GCN-NOT: s32
; GCN: s_setpc_b64
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
entry:
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object:
; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
-; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:32
+; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:28
; GCN: s_setpc_b64
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 {
entry:
; Have another non-tail in the function
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call:
; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec
; GCN: s_mov_b32 s33, s32
-; GCN-DAG: s_add_u32 s32, s32, 0x800
+; GCN-DAG: s_add_u32 s32, s32, 0x400
-; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-DAG: v_writelane_b32 v43, s46, 12
+; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-DAG: v_writelane_b32 v42, s34, 0
+; GCN-DAG: v_writelane_b32 v42, s35, 1
; GCN-DAG: s_getpc_b64 s[4:5]
; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
; GCN: s_swappc_b64
-; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-DAG: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-DAG: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN: s_getpc_b64 s[16:17]
-; GCN-NEXT: s_add_u32 s16, s16, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s17, s17, sibling_call_i32_fastcc_i32_i32@rel32@hi+12
+; GCN: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12
-; GCN-DAG: v_readlane_b32 s35, v43, 1
-; GCN-DAG: v_readlane_b32 s34, v43, 0
+; GCN-DAG: v_readlane_b32 s34, v42, 0
+; GCN-DAG: v_readlane_b32 s35, v42, 1
-; GCN: s_sub_u32 s32, s32, 0x800
+; GCN: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: v_readlane_b32 s33,
-; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, s[18:19]
-; GCN-NEXT: s_setpc_b64 s[16:17]
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
+; GCN-NEXT: s_setpc_b64 s[4:5]
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {
entry:
%other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
; GCN-NOT: s33
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:
-; GCN: s_setpc_b64 s[16:17]
+; GCN: s_setpc_b64 s[4:5]
define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
entry:
%alloca = alloca [16 x i32], align 4, addrspace(5)
; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area:
; GCN-NOT: s33
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:48
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:44
; GCN-NOT: s33
-; GCN: s_setpc_b64 s[16:17]
+; GCN: s_setpc_b64 s[4:5]
define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 {
entry:
%alloca = alloca [16 x i32], align 4, addrspace(5)
--- /dev/null
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: define internal void @indirect() #0 {
+; Trivial callee whose address is taken by @test_simple_indirect_call
+; below. Per the annotator change, a function with its address taken
+; receives the full implicit-argument attribute set (checked as #0 by
+; the label line above).
+define internal void @indirect() {
+ ret void
+}
+
+; GCN-LABEL: define amdgpu_kernel void @test_simple_indirect_call() #1 {
+; Kernel performing an indirect call through a function pointer spilled
+; to an alloca. Because the call target is not a known Function, the
+; annotator cannot compute the callee's needs and must conservatively
+; mark the kernel with all implicit-argument attributes (checked as #1
+; by the label line above; "amdgpu-stack-objects" in #1 presumably comes
+; from the alloca — confirm against the pass that adds it).
+define amdgpu_kernel void @test_simple_indirect_call() {
+ %fptr = alloca void()*
+ store void()* @indirect, void()** %fptr
+ %fp = load void()*, void()** %fptr
+ call void %fp()
+ ret void
+}
+
+; NOTE(review): the two attribute lists below carry no FileCheck check
+; prefix, so they are plain comments and are never verified against the
+; pass output — confirm whether they should be real GCN checks or are
+; intentionally informational documentation of the expected sets.
+; attributes #0 = { "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }
+; attributes #1 = { "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-stack-objects" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" "uniform-work-group-size"="false" }
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=7 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=1 < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}spill_csr_s5_copy:
; GCN: s_or_saveexec_b64
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec
-; GCN: v_writelane_b32 v40, s33, 5
+; GCN: v_writelane_b32 v40, s33, 2
; GCN: s_swappc_b64
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 9
; GCN: buffer_store_dword [[K]], off, s[0:3], s33{{$}}
-; GCN: v_readlane_b32 s33, v40, 5
+; GCN: v_readlane_b32 s33, v40, 2
; GCN: s_or_saveexec_b64
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN: s_mov_b64 exec
; GCN-LABEL: func_call_align1024_bp_gets_vgpr_spill:
; GCN: buffer_store_dword [[VGPR_REG:v[0-9]+]], off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, s[16:17]
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s33, 2
; GCN-DAG: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0
-; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000
-; GCN: v_mov_b32_e32 v32, 0
; GCN-DAG: v_writelane_b32 [[VGPR_REG]], s34, 3
+; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000
; GCN: s_mov_b32 s34, s32
+; GCN: v_mov_b32_e32 v32, 0
; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
-; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34 offset:4
; GCN-DAG: s_add_u32 s32, s32, 0x30000
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32
-; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4
-; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN: s_sub_u32 s32, s32, 0x30000
; GCN-NEXT: v_readlane_b32 s33, [[VGPR_REG]], 2
define hidden void @widget() {
; GCN-LABEL: widget:
; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, s[16:17]
-; GCN-NEXT: v_writelane_b32 v40, s33, 2
-; GCN-NEXT: s_mov_b32 s33, s32
-; GCN-NEXT: s_add_u32 s32, s32, 0x400
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: flat_load_dword v0, v[0:1]
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 21, v0
-; GCN-NEXT: s_and_b64 vcc, exec, vcc
-; GCN-NEXT: s_cbranch_vccz BB0_3
-; GCN-NEXT: ; %bb.1: ; %bb4
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 9, v0
-; GCN-NEXT: s_and_b64 vcc, exec, vcc
-; GCN-NEXT: s_cbranch_vccnz BB0_4
-; GCN-NEXT: ; %bb.2: ; %bb7
-; GCN-NEXT: s_getpc_b64 s[16:17]
-; GCN-NEXT: s_add_u32 s16, s16, wibble@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s17, s17, wibble@rel32@hi+12
-; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT: s_branch BB0_7
-; GCN-NEXT: BB0_3: ; %bb2
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 21, v0
-; GCN-NEXT: s_and_b64 vcc, exec, vcc
-; GCN-NEXT: s_cbranch_vccnz BB0_6
-; GCN-NEXT: BB0_4: ; %bb9
-; GCN-NEXT: s_getpc_b64 s[16:17]
-; GCN-NEXT: s_add_u32 s16, s16, wibble@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s17, s17, wibble@rel32@hi+12
-; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT: v_cmp_lt_f32_e32 vcc, 0, v0
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execnz BB0_7
-; GCN-NEXT: ; %bb.5: ; %bb9.bb12_crit_edge
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: BB0_6: ; %bb12
-; GCN-NEXT: v_mov_b32_e32 v2, 0
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: flat_store_dword v[0:1], v2
-; GCN-NEXT: BB0_7: ; %UnifiedReturnBlock
-; GCN-NEXT: v_readlane_b32 s4, v40, 0
-; GCN-NEXT: v_readlane_b32 s5, v40, 1
-; GCN-NEXT: s_sub_u32 s32, s32, 0x400
-; GCN-NEXT: v_readlane_b32 s33, v40, 2
-; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[4:5]
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_writelane_b32 v40, s33, 2
+; GCN-NEXT: s_mov_b32 s33, s32
+; GCN-NEXT: s_add_u32 s32, s32, 0x400
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: flat_load_dword v0, v[0:1]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 21, v0
+; GCN-NEXT: s_and_b64 vcc, exec, vcc
+; GCN-NEXT: v_writelane_b32 v40, s30, 0
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
+; GCN-NEXT: s_cbranch_vccz BB0_3
+; GCN-NEXT: ; %bb.1: ; %bb4
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 9, v0
+; GCN-NEXT: s_and_b64 vcc, exec, vcc
+; GCN-NEXT: s_cbranch_vccnz BB0_4
+; GCN-NEXT: ; %bb.2: ; %bb7
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, wibble@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, wibble@rel32@hi+12
+; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT: s_branch BB0_7
+; GCN-NEXT: BB0_3: ; %bb2
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 21, v0
+; GCN-NEXT: s_and_b64 vcc, exec, vcc
+; GCN-NEXT: s_cbranch_vccnz BB0_6
+; GCN-NEXT: BB0_4: ; %bb9
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, wibble@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, wibble@rel32@hi+12
+; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT: v_cmp_lt_f32_e32 vcc, 0, v0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execnz BB0_7
+; GCN-NEXT: ; %bb.5: ; %bb9.bb12_crit_edge
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: BB0_6: ; %bb12
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: flat_store_dword v[0:1], v2
+; GCN-NEXT: BB0_7: ; %UnifiedReturnBlock
+; GCN-NEXT: v_readlane_b32 s4, v40, 0
+; GCN-NEXT: v_readlane_b32 s5, v40, 1
+; GCN-NEXT: s_sub_u32 s32, s32, 0x400
+; GCN-NEXT: v_readlane_b32 s33, v40, 2
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[4:5]
; SI-OPT-LABEL: @widget(
; SI-OPT-NEXT: bb:
; SI-OPT-NEXT: [[TMP:%.*]] = load i32, i32 addrspace(1)* null, align 16
; GCN-LABEL: blam:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, s[16:17]
-; GCN-NEXT: v_writelane_b32 v44, s33, 15
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_writelane_b32 v43, s33, 4
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_add_u32 s32, s32, 0x800
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT: v_writelane_b32 v44, s34, 0
-; GCN-NEXT: v_writelane_b32 v44, s35, 1
-; GCN-NEXT: v_writelane_b32 v44, s36, 2
-; GCN-NEXT: v_writelane_b32 v44, s38, 3
-; GCN-NEXT: v_writelane_b32 v44, s39, 4
-; GCN-NEXT: v_writelane_b32 v44, s40, 5
-; GCN-NEXT: v_writelane_b32 v44, s41, 6
-; GCN-NEXT: v_writelane_b32 v44, s42, 7
-; GCN-NEXT: v_writelane_b32 v44, s43, 8
-; GCN-NEXT: v_writelane_b32 v44, s44, 9
-; GCN-NEXT: v_writelane_b32 v44, s45, 10
-; GCN-NEXT: v_writelane_b32 v44, s46, 11
-; GCN-NEXT: v_writelane_b32 v44, s47, 12
-; GCN-NEXT: v_writelane_b32 v44, s48, 13
-; GCN-NEXT: v_writelane_b32 v44, s49, 14
-; GCN-NEXT: v_mov_b32_e32 v40, v31
-; GCN-NEXT: s_mov_b32 s34, s14
-; GCN-NEXT: s_mov_b32 s35, s13
-; GCN-NEXT: s_mov_b32 s36, s12
-; GCN-NEXT: s_mov_b64 s[38:39], s[10:11]
-; GCN-NEXT: s_mov_b64 s[40:41], s[8:9]
-; GCN-NEXT: s_mov_b64 s[42:43], s[6:7]
-; GCN-NEXT: s_mov_b64 s[44:45], s[4:5]
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT: v_writelane_b32 v43, s34, 0
+; GCN-NEXT: v_writelane_b32 v43, s35, 1
+; GCN-NEXT: v_writelane_b32 v43, s36, 2
+; GCN-NEXT: v_writelane_b32 v43, s37, 3
; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v40
-; GCN-NEXT: flat_load_dword v41, v[0:1]
-; GCN-NEXT: v_mov_b32_e32 v43, 0
-; GCN-NEXT: s_getpc_b64 s[48:49]
-; GCN-NEXT: s_add_u32 s48, s48, spam@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s49, s49, spam@rel32@hi+12
-; GCN-NEXT: v_lshlrev_b32_e32 v42, 2, v2
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GCN-NEXT: flat_load_dword v40, v[1:2]
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: s_getpc_b64 s[36:37]
+; GCN-NEXT: s_add_u32 s36, s36, spam@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s37, s37, spam@rel32@hi+12
+; GCN-NEXT: v_lshlrev_b32_e32 v41, 2, v0
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cmp_eq_f32_e64 s[46:47], 0, v41
+; GCN-NEXT: v_cmp_eq_f32_e64 s[34:35], 0, v40
; GCN-NEXT: s_branch BB1_3
-; GCN-NEXT: BB1_1: ; %bb10
-; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
+; GCN-NEXT: BB1_1: ; %bb10
+; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN-NEXT: v_mov_b32_e32 v0, 0x7fc00000
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GCN-NEXT: BB1_2: ; %bb18
-; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
+; GCN-NEXT: BB1_2: ; %bb18
+; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GCN-NEXT: v_mov_b32_e32 v0, 0x7fc00000
; GCN-NEXT: s_mov_b64 s[4:5], 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GCN-NEXT: BB1_3: ; %bb2
-; GCN-NEXT: ; =>This Loop Header: Depth=1
-; GCN-NEXT: ; Child Loop BB1_4 Depth 2
+; GCN-NEXT: BB1_3: ; %bb2
+; GCN-NEXT: ; =>This Loop Header: Depth=1
+; GCN-NEXT: ; Child Loop BB1_4 Depth 2
; GCN-NEXT: s_mov_b64 s[6:7], 0
-; GCN-NEXT: BB1_4: ; %bb2
-; GCN-NEXT: ; Parent Loop BB1_3 Depth=1
-; GCN-NEXT: ; => This Inner Loop Header: Depth=2
-; GCN-NEXT: flat_load_dword v0, v[42:43]
+; GCN-NEXT: BB1_4: ; %bb2
+; GCN-NEXT: ; Parent Loop BB1_3 Depth=1
+; GCN-NEXT: ; => This Inner Loop Header: Depth=2
+; GCN-NEXT: flat_load_dword v0, v[41:42]
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 3, v0
; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GCN-NEXT: s_cbranch_execz BB1_6
-; GCN-NEXT: %bb.5: ; %bb8
-; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2
+; GCN-NEXT: ; %bb.5: ; %bb8
+; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GCN-NEXT: s_mov_b64 s[4:5], 0
; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GCN-NEXT: s_cbranch_execnz BB1_4
; GCN-NEXT: s_branch BB1_1
-; GCN-NEXT: BB1_6: ; %bb6
-; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2
+; GCN-NEXT: BB1_6: ; %bb6
+; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2
; GCN-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN-NEXT: s_mov_b64 s[6:7], 0
; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN-NEXT: s_cbranch_execnz BB1_4
-; GCN-NEXT: %bb.7: ; %bb11
-; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2
-; GCN-NEXT: _or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_mov_b64 s[4:5], s[44:45]
-; GCN-NEXT: s_mov_b64 s[6:7], s[42:43]
-; GCN-NEXT: s_mov_b64 s[8:9], s[40:41]
-; GCN-NEXT: s_mov_b64 s[10:11], s[38:39]
-; GCN-NEXT: s_mov_b32 s12, s36
-; GCN-NEXT: s_mov_b32 s13, s35
-; GCN-NEXT: s_mov_b32 s14, s34
-; GCN-NEXT: v_mov_b32_e32 v31, v40
-; GCN-NEXT: s_swappc_b64 s[30:31], s[48:49]
-; GCN-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: s_mov_b64 s[6:7], 0
-; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc
-; GCN-NEXT: s_cbranch_execnz BB1_4
-; GCN-NEXT: ; %bb.8: ; %bb14
-; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
-; GCN-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], s[46:47]
-; GCN-NEXT: s_cbranch_execnz BB1_10
-; GCN-NEXT: ; %bb.9: ; %bb16
-; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: v_mov_b32_e32 v0, 0x7fc00000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GCN-NEXT: BB1_10: ; %bb17
-; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], 0
-; GCN-NEXT: s_branch BB1_2
-
+; GCN-NEXT: ; %bb.7: ; %bb11
+; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_swappc_b64 s[30:31], s[36:37]
+; GCN-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: s_mov_b64 s[6:7], 0
+; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GCN-NEXT: s_cbranch_execnz BB1_4
+; GCN-NEXT: ; %bb.8: ; %bb14
+; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
+; GCN-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], s[34:35]
+; GCN-NEXT: s_cbranch_execnz BB1_10
+; GCN-NEXT: ; %bb.9: ; %bb16
+; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v0, 0x7fc00000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: BB1_10: ; %bb17
+; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], 0
+; GCN-NEXT: s_branch BB1_2
bb:
%tmp = load float, float* null, align 16
br label %bb2
; GFX9-NEXT: v_mov_b32_e32 v32, v12
; GFX9: ;;#ASMSTART
; GFX9-NEXT: ;;#ASMEND
-; GFX9: image_gather4_c_b_cl v[40:43], v[32:39], s[16:23], s[4:7] dmask:0x1
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX9: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[4:7] dmask:0x1
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-NEXT: v_writelane_b32 v44, s30, 0
; GFX9: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX10: ;;#ASMSTART
; GFX10-NEXT: ;;#ASMEND
-; GFX10: image_gather4_c_b_cl v[40:43], v[32:39], s[16:23], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX10: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_getpc_b64 s[16:17]
-; GFX10-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4
-; GFX10-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12
-; GFX10: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX10-NEXT: s_getpc_b64 s[4:5]
+; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
+; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
+; GFX10: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10: buffer_load_dword v43, off, s[0:3], s33
; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4
; GFX9-NEXT: v_mov_b32_e32 v40, v12
; GFX9: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[4:7] dmask:0x1
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[4:7] dmask:0x1
; GFX9: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10: s_getpc_b64 s[16:17]
-; GFX10-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4
-; GFX10-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12
-; GFX10-NEXT: s_mov_b32 s37, s36
-; GFX10-NEXT: s_mov_b32 s38, s36
-; GFX10-NEXT: s_mov_b32 s39, s36
-; GFX10-NEXT: s_mov_b32 s40, s36
-; GFX10-NEXT: s_mov_b32 s41, s36
-; GFX10-NEXT: s_mov_b32 s42, s36
-; GFX10-NEXT: s_mov_b32 s43, s36
-; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:19], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX10-NEXT: v_writelane_b32 v45, s30, 8
+
+; GFX10: image_gather4_c_b_cl v[0:3], v[12:19], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_getpc_b64 s[4:5]
+; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
+; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
; GFX10-NEXT: v_mov_b32_e32 v40, v16
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT: v_mov_b32_e32 v41, v15
; GFX10-NEXT: v_mov_b32_e32 v42, v14
; GFX10-NEXT: v_mov_b32_e32 v43, v13
-; GFX10-NEXT: v_writelane_b32 v45, s31, 9
; GFX10-NEXT: v_mov_b32_e32 v44, v12
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX10: buffer_load_dword v44, off, s[0:3], s33
# FULL-NEXT: stackPtrOffsetReg: '$sgpr13'
# FULL-NEXT: argumentInfo:
# FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-# FULL-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
-# FULL-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
# FULL-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-# FULL-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
-# FULL-NEXT: workGroupIDX: { reg: '$sgpr6' }
-# FULL-NEXT: workGroupIDY: { reg: '$sgpr13' }
-# FULL-NEXT: workGroupIDZ: { reg: '$sgpr14' }
+# FULL-NEXT: workGroupIDX: { reg: '$sgpr6' }
# FULL-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' }
-# FULL-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
-# FULL-NEXT: workItemIDX: { reg: '$vgpr0' }
-# FULL-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
-# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
+# FULL-NEXT: workItemIDX: { reg: '$vgpr0' }
# FULL-NEXT: mode:
# FULL-NEXT: ieee: true
# FULL-NEXT: dx10-clamp: true
# SIMPLE-NEXT: stackPtrOffsetReg: '$sgpr13'
# SIMPLE-NEXT: argumentInfo:
# SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-# SIMPLE-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
-# SIMPLE-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
# SIMPLE-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-# SIMPLE-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
-# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr6' }
-# SIMPLE-NEXT: workGroupIDY: { reg: '$sgpr13' }
-# SIMPLE-NEXT: workGroupIDZ: { reg: '$sgpr14' }
+# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr6' }
# SIMPLE-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' }
-# SIMPLE-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
-# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr0' }
-# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
-# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
+# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr0' }
# SIMPLE-NEXT: occupancy: 10
# SIMPLE-NEXT: body:
name: kernel0
# FULL-NEXT: stackPtrOffsetReg: '$sp_reg'
# FULL-NEXT: argumentInfo:
# FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-# FULL-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
-# FULL-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
-# FULL-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
-# FULL-NEXT: workGroupIDX: { reg: '$sgpr12' }
-# FULL-NEXT: workGroupIDY: { reg: '$sgpr13' }
-# FULL-NEXT: workGroupIDZ: { reg: '$sgpr14' }
-# FULL-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
-# FULL-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
-# FULL-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
-# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# FULL-NEXT: mode:
# FULL-NEXT: ieee: true
# FULL-NEXT: dx10-clamp: true
# SIMPLE-NEXT: maxKernArgAlign: 1
# SIMPLE-NEXT: argumentInfo:
# SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-# SIMPLE-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
-# SIMPLE-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
-# SIMPLE-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
-# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr12' }
-# SIMPLE-NEXT: workGroupIDY: { reg: '$sgpr13' }
-# SIMPLE-NEXT: workGroupIDZ: { reg: '$sgpr14' }
-# SIMPLE-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
-# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
-# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
-# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# SIMPLE-NEXT: occupancy: 10
# SIMPLE-NEXT: body:
# FULL-NEXT: stackPtrOffsetReg: '$sp_reg'
# FULL-NEXT: argumentInfo:
# FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-# FULL-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
-# FULL-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
-# FULL-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
-# FULL-NEXT: workGroupIDX: { reg: '$sgpr12' }
-# FULL-NEXT: workGroupIDY: { reg: '$sgpr13' }
-# FULL-NEXT: workGroupIDZ: { reg: '$sgpr14' }
-# FULL-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
-# FULL-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
-# FULL-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
-# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# FULL-NEXT: mode:
# FULL-NEXT: ieee: true
# FULL-NEXT: dx10-clamp: true
# SIMPLE-NEXT: maxKernArgAlign: 1
# SIMPLE-NEXT: argumentInfo:
# SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-# SIMPLE-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
-# SIMPLE-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
-# SIMPLE-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
-# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr12' }
-# SIMPLE-NEXT: workGroupIDY: { reg: '$sgpr13' }
-# SIMPLE-NEXT: workGroupIDZ: { reg: '$sgpr14' }
-# SIMPLE-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
-# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
-# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
-# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# SIMPLE-NEXT: occupancy: 10
# SIMPLE-NEXT: body:
# FULL-NEXT: stackPtrOffsetReg: '$sp_reg'
# FULL-NEXT: argumentInfo:
# FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-# FULL-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
-# FULL-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
-# FULL-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
-# FULL-NEXT: workGroupIDX: { reg: '$sgpr12' }
-# FULL-NEXT: workGroupIDY: { reg: '$sgpr13' }
-# FULL-NEXT: workGroupIDZ: { reg: '$sgpr14' }
-# FULL-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
-# FULL-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
-# FULL-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
-# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# FULL-NEXT: mode:
# FULL-NEXT: ieee: true
# FULL-NEXT: dx10-clamp: true
# SIMPLE-NEXT: isEntryFunction: true
# SIMPLE-NEXT: argumentInfo:
# SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-# SIMPLE-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
-# SIMPLE-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
-# SIMPLE-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
-# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr12' }
-# SIMPLE-NEXT: workGroupIDY: { reg: '$sgpr13' }
-# SIMPLE-NEXT: workGroupIDZ: { reg: '$sgpr14' }
-# SIMPLE-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
-# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
-# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
-# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# SIMPLE-NEXT: occupancy: 10
# SIMPLE-NEXT: body:
# FULL: argumentInfo:
# FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-# FULL-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
-# FULL-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
-# FULL-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
# FULL-NEXT: flatScratchInit: { offset: 4 }
-# FULL-NEXT: workGroupIDX: { reg: '$sgpr12' }
-# FULL-NEXT: workGroupIDY: { reg: '$sgpr13' }
-# FULL-NEXT: workGroupIDZ: { reg: '$sgpr14' }
-# FULL-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
-# FULL-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
-# FULL-NEXT: workItemIDY: { reg: '$vgpr0', mask: 65280 }
-# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
+# FULL-NEXT: workItemIDY: { reg: '$vgpr0', mask: 65280 }
# SIMPLE: argumentInfo:
# SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-# SIMPLE-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
-# SIMPLE-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
-# SIMPLE-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
# SIMPLE-NEXT: flatScratchInit: { offset: 4 }
-# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr12' }
-# SIMPLE-NEXT: workGroupIDY: { reg: '$sgpr13' }
-# SIMPLE-NEXT: workGroupIDZ: { reg: '$sgpr14' }
-# SIMPLE-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
-# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
-# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr0', mask: 65280 }
-# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
+# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr0', mask: 65280 }
name: fake_stack_arginfo
machineFunctionInfo:
argumentInfo: