From 8020458c5dc2be841c07d26ff75b5471314e6631 Mon Sep 17 00:00:00 2001
From: alex-t
Date: Fri, 24 Dec 2021 01:01:07 +0300
Subject: [PATCH] [AMDGPU] Changing S_AND_B32 to V_AND_B32_e64 in the divergent
 'trunc' to i1 pattern

In 'trunc' i16/32/64 to i1 pattern the 'and $src, 1' node supply operand to
'setcc'. The latter is selected to S_CMP_EQ/V_CMP_EQ dependent on the
divergence. In case the 'and' is scalar and 'setcc' is divergent, we need
VGPR to SGPR copy to adjust input operand for V_CMP_EQ. This patch changes
the S_AND_B32 to V_AND_B32_e64 in the 'trunc to i1' divergent patterns.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D116241
---
 llvm/lib/Target/AMDGPU/SIInstructions.td           | 12 ++---
 llvm/lib/Target/AMDGPU/VOPInstructions.td          | 12 +++++
 .../AMDGPU/divergence-driven-trunc-to-i1.ll        | 59 ++++++++++++++++++++++
 3 files changed, 77 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 95744b6..636337e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2212,18 +2212,18 @@ def : GCNPat <
 >;
 
 def : GCNPat <
-  (i1 (trunc i32:$a)),
-  (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
+  (i1 (DivergentUnaryFrag<trunc> i32:$a)),
+  (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
 >;
 
 def : GCNPat <
-  (i1 (trunc i16:$a)),
-  (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
+  (i1 (DivergentUnaryFrag<trunc> i16:$a)),
+  (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
 >;
 
 def : GCNPat <
-  (i1 (trunc i64:$a)),
-  (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1),
+  (i1 (DivergentUnaryFrag<trunc> i64:$a)),
+  (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1),
                     (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
 >;
 
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index a3eccf1..a836889 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -794,6 +794,18 @@ class VOPPatGen<SDPatternOperator Op, VOPProfile P> {
   list<dag> ret = [!con(Outs, (set Ins))];
 }
 
+class DivergentUnaryFrag<SDPatternOperator Op> : PatFrag <
+  (ops node:$src0),
+  (Op $src0),
+  [{ return N->isDivergent(); }]> {
+  // This check is unnecessary as it's captured by the result register
+  // bank constraint.
+  //
+  // FIXME: Should add a way for the emitter to recognize this is a
+  // trivially true predicate to eliminate the check.
+  let GISelPredicateCode = [{return true;}];
+}
+
 class VOPPatOrNull<SDPatternOperator Op, VOPProfile P> {
   list<dag> ret = !if(!ne(P.NeedPatGen,PatGenMode.NoPattern), VOPPatGen<Op, P>.ret, []);
 }
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
new file mode 100644
index 0000000..4429ee6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
@@ -0,0 +1,59 @@
+; RUN: llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: name: uniform_trunc_i16_to_i1
+; GCN: S_AND_B32 1
+; GCN: S_CMP_EQ_U32
+define amdgpu_kernel void @uniform_trunc_i16_to_i1(i1 addrspace(1)* %out, i16 %x, i1 %z) {
+  %setcc = icmp slt i16 %x, 0
+  %select = select i1 %setcc, i1 true, i1 %z
+  store i1 %select, i1 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: name: divergent_trunc_i16_to_i1
+; GCN: V_AND_B32_e64 1
+; GCN: V_CMP_EQ_U32_e64
+define i1 @divergent_trunc_i16_to_i1(i1 addrspace(1)* %out, i16 %x, i1 %z) {
+  %setcc = icmp slt i16 %x, 0
+  %select = select i1 %setcc, i1 true, i1 %z
+  ret i1 %select
+}
+
+; GCN-LABEL: name: uniform_trunc_i32_to_i1
+; GCN: S_AND_B32 1
+; GCN: S_CMP_EQ_U32
+define amdgpu_kernel void @uniform_trunc_i32_to_i1(i1 addrspace(1)* %out, i32 %x, i1 %z) {
+  %setcc = icmp slt i32 %x, 0
+  %select = select i1 %setcc, i1 true, i1 %z
+  store i1 %select, i1 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: name: divergent_trunc_i32_to_i1
+; GCN: V_AND_B32_e64 1
+; GCN: V_CMP_EQ_U32_e64
+define i1 @divergent_trunc_i32_to_i1(i1 addrspace(1)* %out, i32 %x, i1 %z) {
+  %setcc = icmp slt i32 %x, 0
+  %select = select i1 %setcc, i1 true, i1 %z
+  ret i1 %select
+}
+
+; GCN-LABEL: name: uniform_trunc_i64_to_i1
+; GCN: S_AND_B32 1
+; GCN: S_CMP_EQ_U32
+define amdgpu_kernel void @uniform_trunc_i64_to_i1(i1 addrspace(1)* %out, i64 %x, i1 %z) {
+  %setcc = icmp slt i64 %x, 0
+  %select = select i1 %setcc, i1 true, i1 %z
+  store i1 %select, i1 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: name: divergent_trunc_i64_to_i1
+; GCN: V_AND_B32_e64 1
+; GCN: V_CMP_EQ_U32_e64
+define i1 @divergent_trunc_i64_to_i1(i1 addrspace(1)* %out, i64 %x, i1 %z) {
+  %setcc = icmp slt i64 %x, 0
+  %select = select i1 %setcc, i1 true, i1 %z
+  ret i1 %select
+}
+
-- 
2.7.4