AMDGPU: add llvm.amdgcn.init.exec and llvm.amdgcn.init.exec.from.input intrinsics

Both intrinsics initialize the EXEC mask at the start of a shader's entry
block: init.exec sets it to a 64-bit literal, while init.exec.from.input
enables the first N lanes, where N is a 7-bit thread count extracted from
an SGPR argument at a given bit offset.

v2: More tests, bug fixes, cosmetic changes.
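
For reference, a minimal usage sketch (the amdgpu_ps signature and the bit
offset of 19 simply mirror the new test file; they are not requirements of
the intrinsic):

  define amdgpu_ps float @example(i32 inreg %count, float %a, float %b) {
  main_body:
    ; enable the first ((%count >> 19) & 0x7f) lanes before any vector work
    call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
    %s = fadd float %a, %b
    ret float %s
  }

  declare void @llvm.amdgcn.init.exec.from.input(i32, i32) #0
  attributes #0 = { convergent }
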
Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D31762
llvm-svn: 301677
GCCBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">,
Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [], [IntrNoMem]>;
+// Set EXEC to the 64-bit value given.
+// This is always moved to the beginning of the basic block.
+def int_amdgcn_init_exec : Intrinsic<[],
+ [llvm_i64_ty], // 64-bit literal constant
+ [IntrConvergent]>;
+
+// Set EXEC according to a thread count packed in an SGPR input:
+// thread_count = (input >> bitoffset) & 0x7f;
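+// For example, with bitoffset = 8 the count occupies bits 8..14, so an
+// input of 0x00002a00 yields thread_count = 42 (the first 42 lanes enabled).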
+// This is always moved to the beginning of the basic block.
+def int_amdgcn_init_exec_from_input : Intrinsic<[],
+ [llvm_i32_ty, // 32-bit SGPR input
+ llvm_i32_ty], // bit offset of the thread count
+ [IntrConvergent]>;
+
//===----------------------------------------------------------------------===//
// Instruction Intrinsics
//===----------------------------------------------------------------------===//
NODE_NAME_CASE(KILL)
NODE_NAME_CASE(DUMMY_CHAIN)
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
+ NODE_NAME_CASE(INIT_EXEC)
+ NODE_NAME_CASE(INIT_EXEC_FROM_INPUT)
NODE_NAME_CASE(SENDMSG)
NODE_NAME_CASE(SENDMSGHALT)
NODE_NAME_CASE(INTERP_MOV)
BUILD_VERTICAL_VECTOR,
/// Pointer to the start of the shader's constant data.
CONST_DATA_PTR,
+ INIT_EXEC,
+ INIT_EXEC_FROM_INPUT,
SENDMSG,
SENDMSGHALT,
INTERP_MOV,
def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
+def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC",
+ SDTypeProfile<0, 1, [SDTCisInt<0>]>,
+ [SDNPHasChain, SDNPInGlue]>;
+
+def AMDGPUinit_exec_from_input : SDNode<"AMDGPUISD::INIT_EXEC_FROM_INPUT",
+ SDTypeProfile<0, 2,
+ [SDTCisInt<0>, SDTCisInt<1>]>,
+ [SDNPHasChain, SDNPInGlue]>;
+
def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG",
SDTypeProfile<0, 1, [SDTCisInt<0>]>,
[SDNPHasChain, SDNPInGlue]>;
MI.eraseFromParent();
return BB;
+ case AMDGPU::SI_INIT_EXEC:
+ // This should be before all vector instructions.
+ BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
+ AMDGPU::EXEC)
+ .addImm(MI.getOperand(0).getImm());
+ MI.eraseFromParent();
+ return BB;
+
+ case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
+ // Extract the thread count from an SGPR input and set EXEC accordingly.
+ // Since BFM can't shift by 64, handle that case with CMP + CMOV.
+ //
+ // S_BFE_U32 count, input, {shift, 7}
+ // S_BFM_B64 exec, count, 0
+ // S_CMP_EQ_U32 count, 64
+ // S_CMOV_B64 exec, -1
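+ // (S_BFM_B64 exec, count, 0 yields (1 << count) - 1; the shift amount is
+ // taken modulo 64, so count == 64 would produce 0 instead of all ones,
+ // hence the CMP/CMOV override below.)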
+ MachineInstr *FirstMI = &*BB->begin();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned InputReg = MI.getOperand(0).getReg();
+ unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ bool Found = false;
+
+ // Move the COPY of the input reg to the beginning, so that we can use it.
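+ // (The S_BFE_U32 emitted below reads InputReg at the top of the block, so
+ // the COPY that defines InputReg from the argument register must be hoisted
+ // ahead of it.)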
+ for (auto I = BB->begin(); I != &MI; I++) {
+ if (I->getOpcode() != TargetOpcode::COPY ||
+ I->getOperand(0).getReg() != InputReg)
+ continue;
+
+ if (I == FirstMI) {
+ FirstMI = &*++BB->begin();
+ } else {
+ I->removeFromParent();
+ BB->insert(FirstMI, &*I);
+ }
+ Found = true;
+ break;
+ }
+ assert(Found);
+ (void)Found; // silence the unused-variable warning in release builds
+
+ // This should be before all vector instructions.
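+ // S_BFE_U32's src1 encodes the bit offset in its low bits and the field
+ // width in bits [22:16]; 0x70000 requests a 7-bit field (counts up to 64).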
+ BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
+ .addReg(InputReg)
+ .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
+ BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
+ AMDGPU::EXEC)
+ .addReg(CountReg)
+ .addImm(0);
+ BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
+ .addReg(CountReg, RegState::Kill)
+ .addImm(64);
+ BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
+ AMDGPU::EXEC)
+ .addImm(-1);
+ MI.eraseFromParent();
+ return BB;
+ }
+
case AMDGPU::GET_GROUPSTATICSIZE: {
DebugLoc DL = MI.getDebugLoc();
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
Op.getOperand(2), Glue);
}
+ case Intrinsic::amdgcn_init_exec: {
+ return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
+ Op.getOperand(2));
+ }
+ case Intrinsic::amdgcn_init_exec_from_input: {
+ return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
+ Op.getOperand(2), Op.getOperand(3));
+ }
case AMDGPUIntrinsic::SI_tbuffer_store: {
SDValue Ops[] = {
Chain,
let isReMaterializable = 1;
}
+def SI_INIT_EXEC : SPseudoInstSI <
+ (outs), (ins i64imm:$src), []> {
+ let Defs = [EXEC];
+ let usesCustomInserter = 1;
+ let isAsCheapAsAMove = 1;
+}
+
+def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
+ (outs), (ins SSrc_b32:$input, i32imm:$shift), []> {
+ let Defs = [EXEC];
+ let usesCustomInserter = 1;
+}
+
// Return for returning shaders to a shader variant epilog.
def SI_RETURN_TO_EPILOG : SPseudoInstSI <
(outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
} // End SubtargetPredicate = isGCN
let Predicates = [isGCN] in {
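+// Selection patterns for the EXEC-initialization pseudos. as_i64imm and
+// as_i32imm only match constants, so the full EXEC mask for init_exec and
+// the bit offset for init_exec_from_input must be immediate operands.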
+def : Pat <
+ (AMDGPUinit_exec i64:$src),
+ (SI_INIT_EXEC (as_i64imm $src))
+>;
+
+def : Pat <
+ (AMDGPUinit_exec_from_input i32:$input, i32:$shift),
+ (SI_INIT_EXEC_FROM_INPUT (i32 $input), (as_i32imm $shift))
+>;
+
def : Pat<
(AMDGPUtrap timm:$trapid),
(S_TRAP $trapid)
--- /dev/null
+;RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GCN
+
+; GCN-LABEL: {{^}}full_mask:
+; GCN: s_mov_b64 exec, -1
+; GCN: v_add_f32_e32 v0,
+define amdgpu_ps float @full_mask(float %a, float %b) {
+main_body:
+ %s = fadd float %a, %b
+ call void @llvm.amdgcn.init.exec(i64 -1)
+ ret float %s
+}
+
+; GCN-LABEL: {{^}}partial_mask:
+; GCN: s_mov_b64 exec, 0x1e240
+; GCN: v_add_f32_e32 v0,
+define amdgpu_ps float @partial_mask(float %a, float %b) {
+main_body:
+ %s = fadd float %a, %b
+ call void @llvm.amdgcn.init.exec(i64 123456)
+ ret float %s
+}
+
+; GCN-LABEL: {{^}}input_s3off8:
+; GCN: s_bfe_u32 s0, s3, 0x70008
+; GCN: s_bfm_b64 exec, s0, 0
+; GCN: s_cmp_eq_u32 s0, 64
+; GCN: s_cmov_b64 exec, -1
+; GCN: v_add_f32_e32 v0,
+define amdgpu_ps float @input_s3off8(i32 inreg, i32 inreg, i32 inreg, i32 inreg %count, float %a, float %b) {
+main_body:
+ %s = fadd float %a, %b
+ call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
+ ret float %s
+}
+
+; GCN-LABEL: {{^}}input_s0off19:
+; GCN: s_bfe_u32 s0, s0, 0x70013
+; GCN: s_bfm_b64 exec, s0, 0
+; GCN: s_cmp_eq_u32 s0, 64
+; GCN: s_cmov_b64 exec, -1
+; GCN: v_add_f32_e32 v0,
+define amdgpu_ps float @input_s0off19(i32 inreg %count, float %a, float %b) {
+main_body:
+ %s = fadd float %a, %b
+ call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
+ ret float %s
+}
+
+; GCN-LABEL: {{^}}reuse_input:
+; GCN: s_bfe_u32 s1, s0, 0x70013
+; GCN: s_bfm_b64 exec, s1, 0
+; GCN: s_cmp_eq_u32 s1, 64
+; GCN: s_cmov_b64 exec, -1
+; GCN: v_add_u32_e32 v0, s0, v0
+define amdgpu_ps float @reuse_input(i32 inreg %count, i32 %a) {
+main_body:
+ call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
+ %s = add i32 %a, %count
+ %f = sitofp i32 %s to float
+ ret float %f
+}
+
+; GCN-LABEL: {{^}}reuse_input2:
+; GCN: s_bfe_u32 s1, s0, 0x70013
+; GCN: s_bfm_b64 exec, s1, 0
+; GCN: s_cmp_eq_u32 s1, 64
+; GCN: s_cmov_b64 exec, -1
+; GCN: v_add_u32_e32 v0, s0, v0
+define amdgpu_ps float @reuse_input2(i32 inreg %count, i32 %a) {
+main_body:
+ %s = add i32 %a, %count
+ %f = sitofp i32 %s to float
+ call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
+ ret float %f
+}
+
+declare void @llvm.amdgcn.init.exec(i64) #1
+declare void @llvm.amdgcn.init.exec.from.input(i32, i32) #1
+
+attributes #1 = { convergent }