From 9356097206a17eb781180d5f895b52f8b0333f8e Mon Sep 17 00:00:00 2001 From: "chenglin.bi" Date: Wed, 19 Apr 2023 11:15:14 +0800 Subject: [PATCH] Revert "[AMDGPU] Ressociate patterns with sub to use SALU" The patch will caused dead loop because of DAGCombiner's canonicalization: // (x + C) - y -> (x - y) + C // y - (x + C) -> (y - x) - C // (x - C) - y -> (x - y) - C // (C - x) - y -> C - (x + y) This reverts commit b3529b5bf3ba2cd7f38665de16450afefb263c9b. --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 96 ------- llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 - llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll | 12 +- llvm/test/CodeGen/AMDGPU/reassoc-sub-add.ll | 305 ----------------------- 4 files changed, 6 insertions(+), 408 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/reassoc-sub-add.ll diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 26f3404..966d103 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -11131,96 +11131,6 @@ SDValue SITargetLowering::reassociateScalarOps(SDNode *N, return DAG.getNode(Opc, SL, VT, Add1, Op2); } -SDValue SITargetLowering::reassociateSub(SDNode *N, SelectionDAG &DAG) const { - EVT VT = N->getValueType(0); - if (VT != MVT::i32 && VT != MVT::i64) - return SDValue(); - - if (DAG.isBaseWithConstantOffset(SDValue(N, 0))) - return SDValue(); - - unsigned Opc = N->getOpcode(); - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - - if (!(Op0->isDivergent() ^ Op1->isDivergent())) - return SDValue(); - - SDLoc SL(N); - if (Op1->isDivergent() && Op1->hasOneUse()) { - unsigned Op1Opc = Op1.getOpcode(); - if (Op1Opc != ISD::ADD && Op1Opc != ISD::SUB) - return SDValue(); - - SDValue Op2 = Op1.getOperand(1); - Op1 = Op1.getOperand(0); - if (Opc == ISD::ADD && Op1Opc == ISD::SUB) { - // s0 + (s1 - v0) --> (s0 + s1) - v0 - if (!Op1->isDivergent() && Op2->isDivergent()) - return DAG.getNode(ISD::SUB, SL, VT, - DAG.getNode(ISD::ADD, SL, VT, Op0, Op1), Op2); - // s0 + (v0 - s1) --> (s0 - s1) + v0 - if (Op1->isDivergent() && !Op2->isDivergent()) - return DAG.getNode(ISD::ADD, SL, VT, - DAG.getNode(ISD::SUB, SL, VT, Op0, Op2), Op1); - } else if (Opc == ISD::SUB) { - if (Op1Opc == ISD::SUB) { - // s0 - (s1 - v0) --> (s0 - s1) + v0 - if (!Op1->isDivergent() && Op2->isDivergent()) - return DAG.getNode(ISD::ADD, SL, VT, - DAG.getNode(ISD::SUB, SL, VT, Op0, Op1), Op2); - // s0 - (v0 - s1) --> (s0 + s1) - v0 - if (Op1->isDivergent() && !Op2->isDivergent()) - return DAG.getNode(ISD::SUB, SL, VT, - DAG.getNode(ISD::ADD, SL, VT, Op0, Op2), Op1); - } else if (Op1Opc == ISD::ADD) { - // s0 - (s1 + v0) --> (s0 - s1) - v0 - if (Op1->isDivergent() ^ Op2->isDivergent()) { - if (Op1->isDivergent()) - std::swap(Op1, Op2); - return DAG.getNode(ISD::SUB, SL, VT, - DAG.getNode(ISD::SUB, SL, VT, Op0, Op1), Op2); - } - } - } - } - - if (Op0->isDivergent() && Op0->hasOneUse()) { - unsigned Op0Opc = Op0.getOpcode(); - if (Op0Opc != ISD::ADD && Op0Opc != ISD::SUB) - return SDValue(); - - SDValue Op2 = Op0.getOperand(1); - Op0 = Op0.getOperand(0); - if (!Op0->isDivergent() && Op2->isDivergent()) { - if (Opc == ISD::SUB) { - // (s1 + v0) - s0 --> (s1 - s0) + v0 - if (Op0Opc == ISD::ADD) - return DAG.getNode(ISD::ADD, SL, VT, - DAG.getNode(ISD::SUB, SL, VT, Op0, Op1), Op2); - - // (s1 - v0) - s0 --> (s1 - s0) - v0 - if (Op0Opc == ISD::SUB) - return DAG.getNode(ISD::SUB, SL, VT, - DAG.getNode(ISD::SUB, SL, VT, Op0, Op1), Op2); - } else if (Opc == ISD::ADD && Op0Opc == ISD::SUB) { - // (s1 - v0) + s0 --> (s0 + s1) - v0 - return DAG.getNode(ISD::SUB, SL, VT, - DAG.getNode(ISD::ADD, SL, VT, Op0, Op1), Op2); - } - } - - if (Op0->isDivergent() && !Op2->isDivergent()) { - // (v0 - s1) + s0 --> (s0 - s1) + v0 - if (Opc == ISD::ADD && Op0Opc == ISD::SUB) - return DAG.getNode(ISD::ADD, SL, VT, - DAG.getNode(ISD::SUB, SL, VT, Op1, Op2), Op0); - } - } - - return SDValue(); -} - static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, @@ -11378,9 +11288,6 @@ SDValue SITargetLowering::performAddCombine(SDNode *N, return V; } - if (SDValue V = reassociateSub(N, DAG)) - return V; - if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG()) return SDValue(); @@ -11423,9 +11330,6 @@ SDValue SITargetLowering::performSubCombine(SDNode *N, SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); - if (SDValue V = reassociateSub(N, DAG)) - return V; - if (VT != MVT::i32) return SDValue(); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index f35b637..de1bc13 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -202,7 +202,6 @@ private: SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const; - SDValue reassociateSub(SDNode *N, SelectionDAG &DAG) const; unsigned getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const; SDValue tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const; diff --git a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll index 7351e7e..1ee9ed2 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll @@ -662,8 +662,8 @@ define amdgpu_kernel void @sub_zext_setcc_commute(ptr addrspace(1) nocapture %ar ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GCN-NEXT: s_sub_i32 s0, s2, s3 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 ; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; @@ -675,10 +675,10 @@ define amdgpu_kernel void @sub_zext_setcc_commute(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v2, s[0:1] -; GFX9-NEXT: s_sub_i32 s2, s2, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 ; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_subrev_u32_e32 v0, s3, v0 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm bb: @@ -711,8 +711,8 @@ define amdgpu_kernel void @sub_sext_setcc_commute(ptr addrspace(1) nocapture %ar ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GCN-NEXT: s_sub_i32 s0, s2, s3 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 ; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; @@ -724,10 +724,10 @@ define amdgpu_kernel void @sub_sext_setcc_commute(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v2, s[0:1] -; GFX9-NEXT: s_sub_i32 s2, s2, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 ; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_subrev_u32_e32 v0, s3, v0 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-sub-add.ll b/llvm/test/CodeGen/AMDGPU/reassoc-sub-add.ll deleted file mode 100644 index fe7e17d..0000000 --- a/llvm/test/CodeGen/AMDGPU/reassoc-sub-add.ll +++ /dev/null @@ -1,305 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s - -; s0 + (s1 - v0) --> (s0 + s1) - v0 -define amdgpu_kernel void @reassoc_sub_add_s0s1v0_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) { -; GFX8-LABEL: reassoc_sub_add_s0s1v0_i32: -; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_add_i32 s2, s3, s2 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_store_dword v[0:1], v2 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: reassoc_sub_add_s0s1v0_i32: -; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s2, s3, s2 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-NEXT: s_endpgm -bb: - %tid = tail call i32 @llvm.amdgcn.workitem.id.x() - %sub = sub i32 %x, %tid - %add = add i32 %y, %sub - store i32 %add, ptr addrspace(1) %arg, align 4 - ret void -} - -; s0 + (v0 - s1) --> (s0 - s1) + v0 -define amdgpu_kernel void @reassoc_sub_add_s0v0s1_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) { -; GFX8-LABEL: reassoc_sub_add_s0v0s1_i32: -; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_i32 s2, s3, s2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_store_dword v[0:1], v2 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: reassoc_sub_add_s0v0s1_i32: -; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_i32 s2, s3, s2 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-NEXT: s_endpgm -bb: - %tid = tail call i32 @llvm.amdgcn.workitem.id.x() - %sub = sub i32 %tid, %x - %add = add i32 %y, %sub - store i32 %add, ptr addrspace(1) %arg, align 4 - ret void -} - -; (v0 - s1) + s0 --> (s0 - s1) + v0 -define amdgpu_kernel void @reassoc_sub_add_v0s1s0_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) { -; GFX8-LABEL: reassoc_sub_add_v0s1s0_i32: -; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_i32 s2, s3, s2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_store_dword v[0:1], v2 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: reassoc_sub_add_v0s1s0_i32: -; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_i32 s2, s3, s2 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-NEXT: s_endpgm -bb: - %tid = tail call i32 @llvm.amdgcn.workitem.id.x() - %sub = sub i32 %tid, %x - %add = add i32 %sub, %y - store i32 %add, ptr addrspace(1) %arg, align 4 - ret void -} - -; s0 - (s1 - v0) --> (s0 - s1) + v0 -define amdgpu_kernel void @reassoc_sub_sub_s0s1v0_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) { -; GFX8-LABEL: reassoc_sub_sub_s0s1v0_i32: -; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_i32 s2, s3, s2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_store_dword v[0:1], v2 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: reassoc_sub_sub_s0s1v0_i32: -; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_i32 s2, s3, s2 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-NEXT: s_endpgm -bb: - %tid = tail call i32 @llvm.amdgcn.workitem.id.x() - %sub1 = sub i32 %x, %tid - %sub2 = sub i32 %y, %sub1 - store i32 %sub2, ptr addrspace(1) %arg, align 4 - ret void -} - -; s0 - (v0 - s1) --> (s0 + s1) - v0 -define amdgpu_kernel void @reassoc_sub_sub_s0v0s1_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) { -; GFX8-LABEL: reassoc_sub_sub_s0v0s1_i32: -; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_add_i32 s2, s3, s2 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_store_dword v[0:1], v2 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: reassoc_sub_sub_s0v0s1_i32: -; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s2, s3, s2 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-NEXT: s_endpgm -bb: - %tid = tail call i32 @llvm.amdgcn.workitem.id.x() - %sub1 = sub i32 %tid, %x - %sub2 = sub i32 %y, %sub1 - store i32 %sub2, ptr addrspace(1) %arg, align 4 - ret void -} - -; s0 - (s1 + v0) --> (s0 - s1) - v0 -define amdgpu_kernel void @reassoc_add_sub_s0s1v0_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) { -; GFX8-LABEL: reassoc_add_sub_s0s1v0_i32: -; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_i32 s2, s3, s2 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_store_dword v[0:1], v2 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: reassoc_add_sub_s0s1v0_i32: -; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_i32 s2, s3, s2 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-NEXT: s_endpgm -bb: - %tid = tail call i32 @llvm.amdgcn.workitem.id.x() - %add = add i32 %x, %tid - %sub = sub i32 %y, %add - store i32 %sub, ptr addrspace(1) %arg, align 4 - ret void -} - -; s0 - (v0 + s1) --> (s0 - s1) - v0 -define amdgpu_kernel void @reassoc_add_sub_s0v0s1_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) { -; GFX8-LABEL: reassoc_add_sub_s0v0s1_i32: -; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_i32 s2, s3, s2 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_store_dword v[0:1], v2 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: reassoc_add_sub_s0v0s1_i32: -; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_i32 s2, s3, s2 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-NEXT: s_endpgm -bb: - %tid = tail call i32 @llvm.amdgcn.workitem.id.x() - %add = add i32 %tid, %x - %sub = sub i32 %y, %add - store i32 %sub, ptr addrspace(1) %arg, align 4 - ret void -} - -; (s1 + v0) - s0 --> (s1 - s0) + v0 -define amdgpu_kernel void @reassoc_add_sub_s1v0s0_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) { -; GFX8-LABEL: reassoc_add_sub_s1v0s0_i32: -; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_i32 s2, s3, s2 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_store_dword v[0:1], v2 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: reassoc_add_sub_s1v0s0_i32: -; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_i32 s2, s3, s2 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-NEXT: s_endpgm -bb: - %tid = tail call i32 @llvm.amdgcn.workitem.id.x() - %add = add i32 %tid, %x - %sub = sub i32 %y, %add - store i32 %sub, ptr addrspace(1) %arg, align 4 - ret void -} - -; (s1 - v0) - s0 --> (s1 - s0) - v0 -define amdgpu_kernel void @reassoc_sub_sub_s1v0s0_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) { -; GFX8-LABEL: reassoc_sub_sub_s1v0s0_i32: -; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_i32 s2, s2, s3 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_store_dword v[0:1], v2 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: reassoc_sub_sub_s1v0s0_i32: -; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_i32 s2, s2, s3 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-NEXT: s_endpgm -bb: - %tid = tail call i32 @llvm.amdgcn.workitem.id.x() - %sub1 = sub i32 %x, %tid - %sub2 = sub i32 %sub1, %y - store i32 %sub2, ptr addrspace(1) %arg, align 4 - ret void -} - -; (s1 - v0) + s0 --> (s0 + s1) - v0 -define amdgpu_kernel void @reassoc_sub_add_s1v0s0_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) { -; GFX8-LABEL: reassoc_sub_add_s1v0s0_i32: -; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_add_i32 s2, s2, s3 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_store_dword v[0:1], v2 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: reassoc_sub_add_s1v0s0_i32: -; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-NEXT: s_endpgm -bb: - %tid = tail call i32 @llvm.amdgcn.workitem.id.x() - %sub = sub i32 %x, %tid - %add = add i32 %sub, %y - store i32 %add, ptr addrspace(1) %arg, align 4 - ret void -} - -declare i32 @llvm.amdgcn.workitem.id.x() -- 2.7.4