From 42230efccf8fe1185be5fa6c23dce0a8183d6ec9 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 19 Oct 2022 11:18:39 +0100
Subject: [PATCH] [DAG] Fold (sra (or (shl x, c1), (shl y, c2)), c1) -> (sext_inreg (or x, (shl y,c2-c1)) iff c2 >= c1

Helps with some of the AMDGPU regressions identified in D136042 where we
were losing signed BFE patterns after sinking shifts behind logic ops.

Differential Revision: https://reviews.llvm.org/D136081
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 37 ++++++++++++++++++++++
 llvm/test/CodeGen/AMDGPU/bfe-patterns.ll      | 44 +++++++++++++--------------
 2 files changed, 58 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 3313645..6b26621 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9433,12 +9433,49 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
             TargetLowering::Legal)
       return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
                          N0.getOperand(0), DAG.getValueType(ExtVT));
+
     // Even if we can't convert to sext_inreg, we might be able to remove
     // this shift pair if the input is already sign extended.
     if (DAG.ComputeNumSignBits(N0.getOperand(0)) > N1C->getZExtValue())
       return N0.getOperand(0);
   }
 
+  // fold (sra (or (shl x, c1), (shl y, c2)), c1)
+  //   -> (sext_inreg (or x, (shl y,c2-c1))
+  // for some c1 and target supports sext_inreg.
+  if (N1C && N0.getOpcode() == ISD::OR &&
+      N0.getOperand(0).getOpcode() == ISD::SHL &&
+      N0.getOperand(1).getOpcode() == ISD::SHL &&
+      (N1 == N0.getOperand(0).getOperand(1) ||
+       N1 == N0.getOperand(1).getOperand(1)) &&
+      N0->hasOneUse() && N0.getOperand(0)->hasOneUse() &&
+      N0.getOperand(1)->hasOneUse()) {
+    ConstantSDNode *N00C = isConstOrConstSplat(N0.getOperand(0).getOperand(1));
+    ConstantSDNode *N01C = isConstOrConstSplat(N0.getOperand(1).getOperand(1));
+    if (N00C && N01C &&
+        N00C->getAPIntValue().uge(N1C->getZExtValue()) &&
+        N01C->getAPIntValue().uge(N1C->getZExtValue())) {
+      unsigned LowBits = OpSizeInBits - (unsigned)N1C->getZExtValue();
+      EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), LowBits);
+      if (VT.isVector())
+        ExtVT = EVT::getVectorVT(*DAG.getContext(), ExtVT,
+                                 VT.getVectorElementCount());
+      if (!LegalOperations ||
+          TLI.getOperationAction(ISD::SIGN_EXTEND_INREG, ExtVT) ==
+              TargetLowering::Legal) {
+        // Apply SRL on top of the SHL nodes, SimplifyDemandedBits will clean
+        // this up. It looks messy but its a lot simpler than handling all the
+        // possible shift value type mismatches we could have....
+        SDLoc DL(N);
+        SDValue LHS = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
+        SDValue RHS = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(1), N1);
+        SDValue Or = DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
+        return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Or,
+                           DAG.getValueType(ExtVT));
+      }
+    }
+  }
+
   // fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2))
   // clamp (add c1, c2) to max shift.
   if (N0.getOpcode() == ISD::SRA) {
diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
index 1c72c08..fe242b6 100644
--- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
@@ -440,40 +440,38 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_uniform_i32(i32 addrspace(1)* %out,
   ret void
 }
 
-; TODO ashr(or(shl(x,c1),shl(y,c2)),c1) -> sign_extend_inreg(or(x,shl(y,c2-c1))) iff c2 >= c1
+; ashr(or(shl(x,c1),shl(y,c2)),c1) -> sign_extend_inreg(or(x,shl(y,c2-c1))) iff c2 >= c1
 define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %x, i32 addrspace(1)* %y) {
 ; SI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_load_dword s2, s[6:7], 0x0
-; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
-; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_load_dword s4, s[4:5], 0x0
+; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b32 s1, s2, 17
-; SI-NEXT:    s_lshl_b32 s0, s0, 19
-; SI-NEXT:    s_or_b32 s0, s1, s0
-; SI-NEXT:    s_ashr_i32 s0, s0, 17
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    s_lshl_b32 s4, s4, 2
+; SI-NEXT:    s_or_b32 s2, s2, s4
+; SI-NEXT:    s_bfe_i32 s4, s2, 0xf0000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dword s2, s[6:7], 0x0
-; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    s_load_dword s4, s[4:5], 0x0
+; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b32 s1, s2, 17
-; VI-NEXT:    s_lshl_b32 s0, s0, 19
-; VI-NEXT:    s_or_b32 s0, s1, s0
-; VI-NEXT:    s_ashr_i32 s0, s0, 17
+; VI-NEXT:    s_lshl_b32 s0, s4, 2
+; VI-NEXT:    s_or_b32 s0, s2, s0
+; VI-NEXT:    s_bfe_i32 s0, s0, 0xf0000
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
-- 
2.7.4
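
For reference, a minimal IR sketch of the pattern the new fold targets (illustrative only, not part of the patch; the function and value names are made up). It uses the same shift amounts as the updated s_sbfe_or_shl_shl_nonuniform_i32 test, c1 = 17 and c2 = 19 on i32, so c2 >= c1 holds:

; Illustrative scalar reduction of the test above; not taken from the LLVM tree.
define i32 @sra_or_shl_shl(i32 %x, i32 %y) {
  %shl1 = shl i32 %x, 17          ; c1 = 17
  %shl2 = shl i32 %y, 19          ; c2 = 19, c2 >= c1
  %or = or i32 %shl1, %shl2
  %sra = ashr i32 %or, 17         ; same amount as the first shl
  ret i32 %sra
}

After the fold this is equivalent to sign-extending the low 15 bits (32 - 17) of (or x, (shl y, 2)), which is what the updated checks verify: s_lshl_b32 by 2, s_or_b32, then s_bfe_i32 with operand 0xf0000 (width 15, offset 0) instead of shl/shl/or/ashr.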