From 2af1640f9aa4ebe5d447586bbdad6514312bb814 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 7 Feb 2020 21:33:39 -0800 Subject: [PATCH] [LegalizeDAG][X86][AMDGPU] Use ANY_EXTEND instead of ZERO_EXTEND when promoting ISD::CTTZ/CTTZ_ZERO_UNDEF. Summary: For CTTZ we place a set bit just past where the non-promoted type stopped so the extended bits won't be used for the count. For CTTZ_ZERO_UNDEF we don't care what happens if no bits are set in the original type and we end up counting into the extended bits. So we can just use ANY_EXTEND for both cases. This matches what is done in type legalization for these operations. We make no effort to force the upper bits to zero. Differential Revision: https://reviews.llvm.org/D74111 --- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 9 +++++++-- llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll | 6 +++--- llvm/test/CodeGen/X86/clz.ll | 21 ++++++++------------- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 5a23573d..d03f765 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -4342,8 +4342,13 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: case ISD::CTPOP: - // Zero extend the argument. - Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(0)); + // Zero extend the argument unless its cttz, then use any_extend. + if (Node->getOpcode() == ISD::CTTZ || + Node->getOpcode() == ISD::CTTZ_ZERO_UNDEF) + Tmp1 = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Node->getOperand(0)); + else + Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(0)); + if (Node->getOpcode() == ISD::CTTZ) { // The count is the same in the promoted type except if the original // value was zero. This can be handled by setting the bit just off diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll index c5e7a73..626ef15 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -131,7 +131,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(i64 addrspace(1)* n ; FUNC-LABEL: {{^}}v_cttz_zero_undef_i8_with_select: ; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; SI-SDWA: v_ffbl_b32_sdwa +; SI-SDWA: v_ffbl_b32_e32 ; EG: MEM_RAT MSKOR define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(i8 addrspace(1)* noalias %out, i8 addrspace(1)* nocapture readonly %arrayidx) nounwind { %val = load i8, i8 addrspace(1)* %arrayidx, align 1 @@ -144,7 +144,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(i8 addrspace(1)* noa ; FUNC-LABEL: {{^}}v_cttz_zero_undef_i16_with_select: ; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; SI-SDWA: v_ffbl_b32_sdwa +; SI-SDWA: v_ffbl_b32_e32 ; EG: MEM_RAT MSKOR define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(i16 addrspace(1)* noalias %out, i16 addrspace(1)* nocapture readonly %arrayidx) nounwind { %val = load i16, i16 addrspace(1)* %arrayidx, align 1 @@ -246,7 +246,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias ; FUNC-LABEL: {{^}}v_cttz_i8_sel_eq_neg1: ; SI: {{buffer|flat}}_load_ubyte ; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; SI-SDWA: v_ffbl_b32_sdwa +; SI-SDWA: v_ffbl_b32_e32 ; EG: MEM_RAT MSKOR ; EG: FFBL_INT define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* nocapture readonly %arrayidx) nounwind { diff --git a/llvm/test/CodeGen/X86/clz.ll b/llvm/test/CodeGen/X86/clz.ll index f76f5b1..7884f3e 100644 --- a/llvm/test/CodeGen/X86/clz.ll +++ b/llvm/test/CodeGen/X86/clz.ll @@ -17,29 +17,25 @@ declare i64 @llvm.ctlz.i64(i64, i1) define i8 @cttz_i8(i8 %x) { ; X32-LABEL: cttz_i8: ; X32: # %bb.0: -; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X32-NEXT: bsfl %eax, %eax +; X32-NEXT: bsfl {{[0-9]+}}(%esp), %eax ; X32-NEXT: # kill: def $al killed $al killed $eax ; X32-NEXT: retl ; ; X64-LABEL: cttz_i8: ; X64: # %bb.0: -; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: bsfl %eax, %eax +; X64-NEXT: bsfl %edi, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; ; X32-CLZ-LABEL: cttz_i8: ; X32-CLZ: # %bb.0: -; X32-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X32-CLZ-NEXT: tzcntl %eax, %eax +; X32-CLZ-NEXT: tzcntl {{[0-9]+}}(%esp), %eax ; X32-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X32-CLZ-NEXT: retl ; ; X64-CLZ-LABEL: cttz_i8: ; X64-CLZ: # %bb.0: -; X64-CLZ-NEXT: movzbl %dil, %eax -; X64-CLZ-NEXT: tzcntl %eax, %eax +; X64-CLZ-NEXT: tzcntl %edi, %eax ; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X64-CLZ-NEXT: retq %tmp = call i8 @llvm.cttz.i8( i8 %x, i1 true ) @@ -503,17 +499,16 @@ define i8 @cttz_i8_zero_test(i8 %n) { ; ; X32-CLZ-LABEL: cttz_i8_zero_test: ; X32-CLZ: # %bb.0: -; X32-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X32-CLZ-NEXT: orl $256, %eax # imm = 0x100 +; X32-CLZ-NEXT: movl $256, %eax # imm = 0x100 +; X32-CLZ-NEXT: orl {{[0-9]+}}(%esp), %eax ; X32-CLZ-NEXT: tzcntl %eax, %eax ; X32-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X32-CLZ-NEXT: retl ; ; X64-CLZ-LABEL: cttz_i8_zero_test: ; X64-CLZ: # %bb.0: -; X64-CLZ-NEXT: movzbl %dil, %eax -; X64-CLZ-NEXT: orl $256, %eax # imm = 0x100 -; X64-CLZ-NEXT: tzcntl %eax, %eax +; X64-CLZ-NEXT: orl $256, %edi # imm = 0x100 +; X64-CLZ-NEXT: tzcntl %edi, %eax ; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X64-CLZ-NEXT: retq %tmp1 = call i8 @llvm.cttz.i8(i8 %n, i1 false) -- 2.7.4