AMDGPU: Fix indirect tail calls

author Matt Arsenault <Matthew.Arsenault@amd.com>

Tue, 20 Apr 2021 18:44:41 +0000 (14:44 -0400)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Wed, 21 Apr 2021 13:15:24 +0000 (09:15 -0400)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Tue, 20 Apr 2021 18:44:41 +0000 (14:44 -0400)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Wed, 21 Apr 2021 13:15:24 +0000 (09:15 -0400)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp

index 785f219..ae68c96 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2851,6 +2851,11 @@ bool SITargetLowering::isEligibleForTailCallOptimization(
    if (!mayTailCallThisCC(CalleeCC))
      return false;
  
+  // For a divergent call target, we need to do a waterfall loop over the
+  // possible callees, which precludes us from using a simple jump.
+  if (Callee->isDivergent())
+    return false;
+
    MachineFunction &MF = DAG.getMachineFunction();
    const Function &CallerF = MF.getFunction();
    CallingConv::ID CallerCC = CallerF.getCallingConv();
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td

index 381f262..6a4f984 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -492,6 +492,11 @@ def SI_TCRETURN : SPseudoInstSI <(outs),
    let isConvergent = 1;
  }
  
+// Handle selecting indirect tail calls
+def : GCNPat<
+  (AMDGPUtc_return i64:$src0, (i64 0), (i32 timm:$fpdiff)),
+  (SI_TCRETURN SReg_64:$src0, (i64 0), i32imm:$fpdiff)
+>;
  
  def ADJCALLSTACKUP : SPseudoInstSI<
    (outs), (ins i32imm:$amt0, i32imm:$amt1),
diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll

index c8aea66..a6a1061 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
@@ -272,5 +272,35 @@ entry:
    ret i32 %ret
  }
  
+@func_ptr_gv = external unnamed_addr addrspace(4) constant i32(i32, i32)*, align 4
+
+; Do support tail calls with a uniform, but unknown, callee.
+; GCN-LABEL: {{^}}indirect_uniform_sibling_call_i32_fastcc_i32_i32:
+; GCN: s_load_dwordx2 [[GV_ADDR:s\[[0-9]+:[0-9]+\]]]
+; GCN: s_load_dwordx2 [[FUNC_PTR:s\[[0-9]+:[0-9]+\]]], [[GV_ADDR]]
+; GCN: s_setpc_b64 [[FUNC_PTR]]
+define hidden fastcc i32 @indirect_uniform_sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 {
+entry:
+  %func.ptr.load = load i32(i32, i32)*, i32(i32, i32)* addrspace(4)* @func_ptr_gv
+  %ret = tail call fastcc i32 %func.ptr.load(i32 %a, i32 %b)
+  ret i32 %ret
+}
+
+; We can't support a tail call to a divergent target. Use a waterfall
+; loop around a regular call.
+; GCN-LABEL: {{^}}indirect_divergent_sibling_call_i32_fastcc_i32_i32:
+; GCN: v_readfirstlane_b32
+; GCN: v_readfirstlane_b32
+; GCN: s_and_saveexec_b64
+; GCN: s_swappc_b64
+; GCN: s_cbranch_execnz
+; GCN: s_setpc_b64
+define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(i32(i32, i32)* %func.ptr, i32 %a, i32 %b, i32 %c) #1 {
+entry:
+  %add = add i32 %b, %c
+  %ret = tail call fastcc i32 %func.ptr(i32 %a, i32 %add)
+  ret i32 %ret
+}
+
  attributes #0 = { nounwind }
  attributes #1 = { nounwind noinline }
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Tue, 20 Apr 2021 18:44:41 +0000 (14:44 -0400)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Wed, 21 Apr 2021 13:15:24 +0000 (09:15 -0400)
llvm/lib/Target/AMDGPU/SIISelLowering.cpp		patch | blob | history
llvm/lib/Target/AMDGPU/SIInstructions.td		patch | blob | history
llvm/test/CodeGen/AMDGPU/sibling-call.ll		patch | blob | history