From: Mirko Brkusanin Date: Wed, 9 Sep 2020 09:28:36 +0000 (+0200) Subject: [AMDGPU] Workaround for LDS Misalignment bug on GFX10 X-Git-Tag: llvmorg-13-init~12579 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=43af2a6faa272565cde4e3eec7dfeac593d29701;p=platform%2Fupstream%2Fllvm.git [AMDGPU] Workaround for LDS Misalignment bug on GFX10 Add subtarget feature check to avoid using ds_read/write_b96/128 with too low alignment if a bug is present on that specific hardware. Add this "feature" to GFX 10.1.1 as it is also affected. Add global-isel test. --- diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 37e4b56..3e8cd60 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -163,7 +163,7 @@ def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", def FeatureLdsMisalignedBug : SubtargetFeature<"lds-misaligned-bug", "LDSMisalignedBug", "true", - "Some GFX10 bug with misaligned multi-dword LDS access in WGP mode" + "Some GFX10 bug with multi-dword LDS and flat access that is not naturally aligned in WGP mode" >; def FeatureMFMAInlineLiteralBug : SubtargetFeature<"mfma-inline-literal-bug", @@ -929,6 +929,7 @@ def FeatureISAVersion10_1_1 : FeatureSet< FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, + FeatureLdsMisalignedBug, FeatureDoesNotSupportXNACK, FeatureCodeObjectV3])>; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index ad9c4d0..26fbab6 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1417,8 +1417,10 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( } if (Size == 96) { // ds_read/write_b96 require 16-byte alignment on gfx8 and older. - bool Aligned = - Alignment >= Align(Subtarget->hasUnalignedDSAccess() ? 4 : 16); + bool Aligned = Alignment >= Align((Subtarget->hasUnalignedDSAccess() && + !Subtarget->hasLDSMisalignedBug()) + ? 4 + : 16); if (IsFast) *IsFast = Aligned; @@ -1428,8 +1430,10 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( // ds_read/write_b128 require 16-byte alignment on gfx8 and older, but we // can do a 8 byte aligned, 16 byte access in a single operation using // ds_read2/write2_b64. - bool Aligned = - Alignment >= Align(Subtarget->hasUnalignedDSAccess() ? 4 : 8); + bool Aligned = Alignment >= Align((Subtarget->hasUnalignedDSAccess() && + !Subtarget->hasLDSMisalignedBug()) + ? 4 + : 8); if (IsFast) *IsFast = Aligned; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll new file mode 100644 index 0000000..7d5a49c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll @@ -0,0 +1,128 @@ +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,VECT %s + +; GCN-LABEL: test_local_misaligned_v2: +; GCN-DAG: ds_read2_b32 +; GCN-DAG: ds_write2_b32 +define amdgpu_kernel void @test_local_misaligned_v2(i32 addrspace(3)* %arg) { +bb: + %lid = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid + %ptr = bitcast i32 addrspace(3)* %gep to <2 x i32> addrspace(3)* + %load = load <2 x i32>, <2 x i32> addrspace(3)* %ptr, align 4 + %v1 = extractelement <2 x i32> %load, i32 0 + %v2 = extractelement <2 x i32> %load, i32 1 + %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0 + %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1 + store <2 x i32> %v4, <2 x i32> addrspace(3)* %ptr, align 4 + ret void +} + +; GCN-LABEL: test_local_misaligned_v4: +; VECT-DAG: ds_read_b128 +; VECT-DAG: ds_write_b128 +; SPLIT-DAG: ds_read2_b32 +; SPLIT-DAG: ds_read2_b32 +; SPLIT-DAG: ds_write2_b32 +; SPLIT-DAG: ds_write2_b32 +define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) { +bb: + %lid = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid + %ptr = bitcast i32 addrspace(3)* %gep to <4 x i32> addrspace(3)* + %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4 + %v1 = extractelement <4 x i32> %load, i32 0 + %v2 = extractelement <4 x i32> %load, i32 1 + %v3 = extractelement <4 x i32> %load, i32 2 + %v4 = extractelement <4 x i32> %load, i32 3 + %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0 + %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1 + %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2 + %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3 + store <4 x i32> %v8, <4 x i32> addrspace(3)* %ptr, align 4 + ret void +} + +; GCN-LABEL: test_local_misaligned_v3: +; VECT-DAG: ds_read_b96 +; VECT-DAG: ds_write_b96 +; SPLIT-DAG: ds_read2_b32 +; SPLIT-DAG: ds_read_b32 +; SPLIT-DAG: ds_write2_b32 +; SPLIT-DAG: ds_write_b32 +define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) { +bb: + %lid = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid + %ptr = bitcast i32 addrspace(3)* %gep to <3 x i32> addrspace(3)* + %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 4 + %v1 = extractelement <3 x i32> %load, i32 0 + %v2 = extractelement <3 x i32> %load, i32 1 + %v3 = extractelement <3 x i32> %load, i32 2 + %v5 = insertelement <3 x i32> undef, i32 %v3, i32 0 + %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1 + %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2 + store <3 x i32> %v7, <3 x i32> addrspace(3)* %ptr, align 4 + ret void +} + +; GCN-LABEL: test_local_aligned_v2: +; GCN-DAG: ds_read_b64 +; GCN-DAG: ds_write_b64 +define amdgpu_kernel void @test_local_aligned_v2(i32 addrspace(3)* %arg) { +bb: + %lid = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid + %ptr = bitcast i32 addrspace(3)* %gep to <2 x i32> addrspace(3)* + %load = load <2 x i32>, <2 x i32> addrspace(3)* %ptr, align 8 + %v1 = extractelement <2 x i32> %load, i32 0 + %v2 = extractelement <2 x i32> %load, i32 1 + %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0 + %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1 + store <2 x i32> %v4, <2 x i32> addrspace(3)* %ptr, align 8 + ret void +} + +; GCN-LABEL: test_local_aligned_v3: +; GCN-DAG: ds_read_b96 +; GCN-DAG: ds_write_b96 +define amdgpu_kernel void @test_local_aligned_v3(i32 addrspace(3)* %arg) { +bb: + %lid = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid + %ptr = bitcast i32 addrspace(3)* %gep to <3 x i32> addrspace(3)* + %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 16 + %v1 = extractelement <3 x i32> %load, i32 0 + %v2 = extractelement <3 x i32> %load, i32 1 + %v3 = extractelement <3 x i32> %load, i32 2 + %v5 = insertelement <3 x i32> undef, i32 %v3, i32 0 + %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1 + %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2 + store <3 x i32> %v7, <3 x i32> addrspace(3)* %ptr, align 16 + ret void +} + +; GCN-LABEL: test_local_v4_aligned8: +; GCN-DAG: ds_read_b128 +; GCN-DAG: ds_write_b128 +define amdgpu_kernel void @test_local_v4_aligned8(i32 addrspace(3)* %arg) { +bb: + %lid = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid + %ptr = bitcast i32 addrspace(3)* %gep to <4 x i32> addrspace(3)* + %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8 + %v1 = extractelement <4 x i32> %load, i32 0 + %v2 = extractelement <4 x i32> %load, i32 1 + %v3 = extractelement <4 x i32> %load, i32 2 + %v4 = extractelement <4 x i32> %load, i32 3 + %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0 + %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1 + %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2 + %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3 + store <4 x i32> %v8, <4 x i32> addrspace(3)* %ptr, align 8 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll index 975e230..1e5dcff 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s -; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VECT %s +; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s ; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,VECT %s @@ -21,8 +21,12 @@ bb: } ; GCN-LABEL: test_local_misaligned_v4: -; GCN-DAG: ds_read_b128 -; GCN-DAG: ds_write_b128 +; VECT-DAG: ds_read_b128 +; VECT-DAG: ds_write_b128 +; SPLIT-DAG: ds_read2_b32 +; SPLIT-DAG: ds_read2_b32 +; SPLIT-DAG: ds_write2_b32 +; SPLIT-DAG: ds_write2_b32 define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -42,8 +46,12 @@ bb: } ; GCN-LABEL: test_local_misaligned_v3: -; GCN-DAG: ds_read_b96 -; GCN-DAG: ds_write_b96 +; VECT-DAG: ds_read_b96 +; VECT-DAG: ds_write_b96 +; SPLIT-DAG: ds_read2_b32 +; SPLIT-DAG: ds_read_b32 +; SPLIT-DAG: ds_write2_b32 +; SPLIT-DAG: ds_write_b32 define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x()