Register AndMaskSrc = LiveIn;
+ // TODO: Avoid clearing the high bits if we know workitem id y/z are always
+ // 0.
if (Shift != 0) {
auto ShiftAmt = B.buildConstant(S32, Shift);
AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
return true;
}
+static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C) {
+ B.buildConstant(MI.getOperand(0).getReg(), C);
+ MI.eraseFromParent();
+ return true;
+}
+
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MachineInstr &MI) const {
MachineIRBuilder &B = Helper.MIRBuilder;
case Intrinsic::amdgcn_implicitarg_ptr:
return legalizeImplicitArgPtr(MI, MRI, B);
case Intrinsic::amdgcn_workitem_id_x:
+ if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0)
+ return replaceWithConstant(B, MI, 0);
return legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::WORKITEM_ID_X);
case Intrinsic::amdgcn_workitem_id_y:
+ if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0)
+ return replaceWithConstant(B, MI, 0);
+
return legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
case Intrinsic::amdgcn_workitem_id_z:
+ if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0)
+ return replaceWithConstant(B, MI, 0);
+
return legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
case Intrinsic::amdgcn_workgroup_id_x:
return getPreloadedValue(DAG, *MFI, VT,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
case Intrinsic::amdgcn_workitem_id_x:
+ if (Subtarget->getMaxWorkitemID(MF.getFunction(), 0) == 0)
+ return DAG.getConstant(0, DL, MVT::i32);
+
return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
SDLoc(DAG.getEntryNode()),
MFI->getArgInfo().WorkItemIDX);
case Intrinsic::amdgcn_workitem_id_y:
+ if (Subtarget->getMaxWorkitemID(MF.getFunction(), 1) == 0)
+ return DAG.getConstant(0, DL, MVT::i32);
+
return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
SDLoc(DAG.getEntryNode()),
MFI->getArgInfo().WorkItemIDY);
case Intrinsic::amdgcn_workitem_id_z:
+ if (Subtarget->getMaxWorkitemID(MF.getFunction(), 2) == 0)
+ return DAG.getConstant(0, DL, MVT::i32);
+
return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
SDLoc(DAG.getEntryNode()),
MFI->getArgInfo().WorkItemIDZ);
-; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,HSA,CO-V2 %s
-; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=carrizo -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,HSA,CO-V2 %s
-; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=hawaii -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA %s
-; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA %s
-; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mattr=+flat-for-global -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2 %s
-; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2 %s
+; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,HSA,CO-V2,UNPACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=carrizo -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,HSA,CO-V2,UNPACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=hawaii -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mattr=+flat-for-global -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,UNPACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,UNPACKED %s
; RUN: llc -global-isel -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,PACKED-TID %s
declare i32 @llvm.amdgcn.workitem.id.x() #0
ret void
}
+; FIXME: Should be able to avoid enabling in kernel inputs
+; FIXME: Packed tid should avoid the and
+; ALL-LABEL: {{^}}test_reqd_workgroup_size_x_only:
+; CO-V2: enable_vgpr_workitem_id = 2
+
+; ALL-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; UNPACKED-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+
+; PACKED: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x3ff, v0
+; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
+
+; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
+; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
+define amdgpu_kernel void @test_reqd_workgroup_size_x_only(i32* %out) !reqd_work_group_size !0 {
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.y = call i32 @llvm.amdgcn.workitem.id.y()
+ %id.z = call i32 @llvm.amdgcn.workitem.id.z()
+ store volatile i32 %id.x, i32* %out
+ store volatile i32 %id.y, i32* %out
+ store volatile i32 %id.z, i32* %out
+ ret void
+}
+
+; ALL-LABEL: {{^}}test_reqd_workgroup_size_y_only:
+; CO-V2: enable_vgpr_workitem_id = 2
+
+; ALL: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
+
+; UNPACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+
+; PACKED: v_bfe_u32 [[MASKED:v[0-9]+]], v0, 10, 10
+; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
+
+; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
+define amdgpu_kernel void @test_reqd_workgroup_size_y_only(i32* %out) !reqd_work_group_size !1 {
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.y = call i32 @llvm.amdgcn.workitem.id.y()
+ %id.z = call i32 @llvm.amdgcn.workitem.id.z()
+ store volatile i32 %id.x, i32* %out
+ store volatile i32 %id.y, i32* %out
+ store volatile i32 %id.z, i32* %out
+ ret void
+}
+
+; ALL-LABEL: {{^}}test_reqd_workgroup_size_z_only:
+; CO-V2: enable_vgpr_workitem_id = 2
+
+; ALL: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
+; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
+
+; UNPACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v2
+
+; PACKED: v_bfe_u32 [[MASKED:v[0-9]+]], v0, 10, 20
+; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
+define amdgpu_kernel void @test_reqd_workgroup_size_z_only(i32* %out) !reqd_work_group_size !2 {
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.y = call i32 @llvm.amdgcn.workitem.id.y()
+ %id.z = call i32 @llvm.amdgcn.workitem.id.z()
+ store volatile i32 %id.x, i32* %out
+ store volatile i32 %id.y, i32* %out
+ store volatile i32 %id.z, i32* %out
+ ret void
+}
+
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
+
+!0 = !{i32 64, i32 1, i32 1}
+!1 = !{i32 1, i32 64, i32 1}
+!2 = !{i32 1, i32 1, i32 64}
-; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,CO-V2 %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,CO-V2 %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA %s
-; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2 %s
-; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,CO-V2,UNPACKED %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,CO-V2,UNPACKED %s
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s
+; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,UNPACKED %s
+; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,UNPACKED %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,PACKED-TID %s
declare i32 @llvm.amdgcn.workitem.id.x() #0
ret void
}
+; FIXME: Should be able to avoid enabling in kernel inputs
+; FIXME: Packed tid should avoid the and
+; ALL-LABEL: {{^}}test_reqd_workgroup_size_x_only:
+; CO-V2: enable_vgpr_workitem_id = 2
+
+; ALL-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; UNPACKED-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+
+; PACKED: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x3ff, v0
+; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
+
+; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
+; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
+define amdgpu_kernel void @test_reqd_workgroup_size_x_only(i32* %out) !reqd_work_group_size !0 {
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.y = call i32 @llvm.amdgcn.workitem.id.y()
+ %id.z = call i32 @llvm.amdgcn.workitem.id.z()
+ store volatile i32 %id.x, i32* %out
+ store volatile i32 %id.y, i32* %out
+ store volatile i32 %id.z, i32* %out
+ ret void
+}
+
+; ALL-LABEL: {{^}}test_reqd_workgroup_size_y_only:
+; CO-V2: enable_vgpr_workitem_id = 2
+
+; ALL: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
+
+; UNPACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+
+; PACKED: v_bfe_u32 [[MASKED:v[0-9]+]], v0, 10, 10
+; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
+
+; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
+define amdgpu_kernel void @test_reqd_workgroup_size_y_only(i32* %out) !reqd_work_group_size !1 {
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.y = call i32 @llvm.amdgcn.workitem.id.y()
+ %id.z = call i32 @llvm.amdgcn.workitem.id.z()
+ store volatile i32 %id.x, i32* %out
+ store volatile i32 %id.y, i32* %out
+ store volatile i32 %id.z, i32* %out
+ ret void
+}
+
+; ALL-LABEL: {{^}}test_reqd_workgroup_size_z_only:
+; CO-V2: enable_vgpr_workitem_id = 2
+
+; ALL: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
+; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
+
+; UNPACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v2
+
+; PACKED: v_bfe_u32 [[MASKED:v[0-9]+]], v0, 10, 20
+; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
+define amdgpu_kernel void @test_reqd_workgroup_size_z_only(i32* %out) !reqd_work_group_size !2 {
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.y = call i32 @llvm.amdgcn.workitem.id.y()
+ %id.z = call i32 @llvm.amdgcn.workitem.id.z()
+ store volatile i32 %id.x, i32* %out
+ store volatile i32 %id.y, i32* %out
+ store volatile i32 %id.z, i32* %out
+ ret void
+}
+
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
+
+!0 = !{i32 64, i32 1, i32 1}
+!1 = !{i32 1, i32 64, i32 1}
+!2 = !{i32 1, i32 1, i32 64}