From 7b9f620e78464626fad36e629f5d053892e1cf8c Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 6 Apr 2022 17:16:50 +0100 Subject: [PATCH] [AMDGPU] Work around GFX11 flat scratch SVS swizzling bug Differential Revision: https://reviews.llvm.org/D127635 --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 22 +++++++++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 2 + .../Target/AMDGPU/AMDGPUInstructionSelector.cpp | 21 ++++++++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 2 + llvm/lib/Target/AMDGPU/GCNSubtarget.h | 2 + llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll | 56 +++++++++++----------- llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 12 ++--- 7 files changed, 84 insertions(+), 33 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index e32d0c8..fc55e29 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1805,6 +1805,24 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr, return true; } +// Check whether the flat scratch SVS swizzle bug affects this access. +bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug( + SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const { + if (!Subtarget->hasFlatScratchSVSSwizzleBug()) + return false; + + // The bug affects the swizzling of SVS accesses if there is any carry out + // from the two low order bits (i.e. from bit 1 into bit 2) when adding + // voffset to (soffset + inst_offset). + KnownBits VKnown = CurDAG->computeKnownBits(VAddr); + KnownBits SKnown = KnownBits::computeForAddSub( + true, false, CurDAG->computeKnownBits(SAddr), + KnownBits::makeConstant(APInt(32, ImmOffset))); + uint64_t VMax = VKnown.getMaxValue().getZExtValue(); + uint64_t SMax = SKnown.getMaxValue().getZExtValue(); + return (VMax & 3) + (SMax & 3) >= 4; +} + bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &SAddr, SDValue &Offset) const { @@ -1832,6 +1850,8 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32)); VAddr = SDValue(VMov, 0); SAddr = LHS; + if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset)) + return false; Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16); return true; } @@ -1854,6 +1874,8 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, return false; } + if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset)) + return false; SAddr = SelectSAddrFI(CurDAG, SAddr); Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16); return true; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 21f97f2..93d43e1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -188,6 +188,8 @@ private: SDValue &VOffset, SDValue &Offset) const; bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &Offset) const; + bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr, + uint64_t ImmOffset) const; bool SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &SAddr, SDValue &Offset) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index a01582c..f20cd8e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3985,6 +3985,24 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { }}; } +// Check whether the flat scratch SVS swizzle bug affects this access. +bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug( + Register VAddr, Register SAddr, uint64_t ImmOffset) const { + if (!Subtarget->hasFlatScratchSVSSwizzleBug()) + return false; + + // The bug affects the swizzling of SVS accesses if there is any carry out + // from the two low order bits (i.e. from bit 1 into bit 2) when adding + // voffset to (soffset + inst_offset). + auto VKnown = KnownBits->getKnownBits(VAddr); + auto SKnown = KnownBits::computeForAddSub( + true, false, KnownBits->getKnownBits(SAddr), + KnownBits::makeConstant(APInt(32, ImmOffset))); + uint64_t VMax = VKnown.getMaxValue().getZExtValue(); + uint64_t SMax = SKnown.getMaxValue().getZExtValue(); + return (VMax & 3) + (SMax & 3) >= 4; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const { Register Addr = Root.getReg(); @@ -4013,6 +4031,9 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const { Register LHS = AddrDef->MI->getOperand(1).getReg(); auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI); + if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset)) + return None; + if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) { int FI = LHSDef->MI->getOperand(1).getIndex(); return {{ diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 6a101f9..dd74a26 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -210,6 +210,8 @@ private: InstructionSelector::ComplexRendererFns selectScratchSAddr(MachineOperand &Root) const; + bool checkFlatScratchSVSSwizzleBug(Register VAddr, Register SAddr, + uint64_t ImmOffset) const; InstructionSelector::ComplexRendererFns selectScratchSVAddr(MachineOperand &Root) const; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index a710235..4f54e76 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1044,6 +1044,8 @@ public: bool hasVOPDInsts() const { return HasVOPDInsts; } + bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; } + /// Return true if the target has the S_DELAY_ALU instruction. bool hasDelayAlu() const { return GFX11Insts; } diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll index 3dd8dcf..32297e8 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -51,12 +51,12 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc +; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_endpgm ; @@ -132,17 +132,17 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX11-SDAG-LABEL: soff1_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc +; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_endpgm ; @@ -311,12 +311,12 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc +; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_endpgm ; @@ -395,18 +395,18 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX11-SDAG-LABEL: soff2_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc +; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_endpgm ; @@ -576,16 +576,17 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 2 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-NEXT: v_add3_u32 v2, 4, s0, v0 ; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v2, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v2, v3, off offset:2 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v4, s0 offset:4 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_endpgm ; @@ -664,18 +665,19 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX11-SDAG-LABEL: soff4_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-NEXT: v_add3_u32 v3, 4, s0, v0 ; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 ; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v3, v2, off offset:2 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v4, s0 offset:4 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index a8e97b5..c49c617 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -4091,11 +4091,11 @@ define void @store_load_i32_large_negative_unaligned(i8 addrspace(5)* nocapture ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0xfffff000, v0 ; GFX11-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-NEXT: s_movk_i32 s0, 0xef7f -; GFX11-NEXT: scratch_store_b8 v0, v1, s0 dlc +; GFX11-NEXT: scratch_store_b8 v0, v1, off offset:-129 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_u8 v0, v0, s0 glc dlc +; GFX11-NEXT: scratch_load_u8 v0, v0, off offset:-129 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4149,11 +4149,11 @@ define void @store_load_i32_large_negative_unaligned(i8 addrspace(5)* nocapture ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-PAL-NEXT: v_add_nc_u32_e32 v0, 0xfffff000, v0 ; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-PAL-NEXT: s_movk_i32 s0, 0xef7f -; GFX11-PAL-NEXT: scratch_store_b8 v0, v1, s0 dlc +; GFX11-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-129 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: scratch_load_u8 v0, v0, s0 glc dlc +; GFX11-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-129 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] bb: -- 2.7.4