From c2814e12e7fd1d51caa3c14a7053dedadd99152c Mon Sep 17 00:00:00 2001
From: Rhys Perry
Date: Wed, 17 Apr 2019 16:31:52 +0000
Subject: [PATCH] AMDGPU: Force skip over SMRD, VMEM and s_waitcnt instructions

Summary:
This fixes a large Dawn of War 3 performance regression with RADV from
Mesa 19.0 to master which was caused by creating less code in some branches.

Reviewers: arsen, nhaehnle

Reviewed By: nhaehnle

Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D60824

llvm-svn: 358592
---
 llvm/lib/Target/AMDGPU/SIInsertSkips.cpp   | 4 ++++
 llvm/test/CodeGen/AMDGPU/collapse-endcf.ll | 6 ++++++
 llvm/test/CodeGen/AMDGPU/skip-if-dead.ll   | 1 +
 llvm/test/CodeGen/AMDGPU/valu-i1.ll        | 3 +++
 llvm/test/CodeGen/AMDGPU/wqm.ll            | 1 +
 5 files changed, 15 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
index 9ea05d9..f5f1f99 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -137,6 +137,10 @@ bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
       if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
         return true;
 
+      // These instructions are potentially expensive even if EXEC = 0.
+      if (TII->isSMRD(*I) || TII->isVMEM(*I) || I->getOpcode() == AMDGPU::S_WAITCNT)
+        return true;
+
       ++NumInstr;
       if (NumInstr >= SkipThreshold)
         return true;
diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
index 27cbd14..a425bcf 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -7,6 +7,7 @@
 ; GCN-NEXT: s_cbranch_execz [[ENDIF]]
 ; GCN: s_and_b64 exec, exec, vcc
 ; GCN-NEXT: ; mask branch [[ENDIF]]
+; GCN-NEXT: s_cbranch_execz [[ENDIF]]
 ; GCN-NEXT: {{^BB[0-9_]+}}:
 ; GCN: store_dword
 ; GCN-NEXT: {{^}}[[ENDIF]]:
@@ -46,6 +47,7 @@ bb.outer.end: ; preds = %bb.outer.then, %bb.
 ; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER]]
 ; GCN: s_and_saveexec_b64 [[SAVEEXEC_INNER:s\[[0-9:]+\]]]
 ; GCN-NEXT: ; mask branch [[ENDIF_INNER:BB[0-9_]+]]
+; GCN-NEXT: s_cbranch_execz [[ENDIF_INNER]]
 ; GCN-NEXT: {{^BB[0-9_]+}}:
 ; GCN: store_dword
 ; GCN-NEXT: {{^}}[[ENDIF_INNER]]:
@@ -91,6 +93,7 @@ bb.outer.end: ; preds = %bb.inner.then, %bb
 ; GCN: s_and_saveexec_b64 [[SAVEEXEC_INNER:s\[[0-9:]+\]]]
 ; GCN-NEXT: s_xor_b64 [[SAVEEXEC_INNER2:s\[[0-9:]+\]]], exec, [[SAVEEXEC_INNER]]
 ; GCN-NEXT: ; mask branch [[THEN_INNER:BB[0-9_]+]]
+; GCN-NEXT: s_cbranch_execz [[THEN_INNER]]
 ; GCN-NEXT: {{^BB[0-9_]+}}:
 ; GCN: store_dword
 ; GCN-NEXT: {{^}}[[THEN_INNER]]:
@@ -140,6 +143,7 @@ bb.outer.end: ; preds = %bb, %bb.then, %b
 ; GCN: store_dword
 ; GCN-NEXT: s_and_saveexec_b64 [[SAVEEXEC_INNER_IF_OUTER_ELSE:s\[[0-9:]+\]]]
 ; GCN-NEXT: ; mask branch [[THEN_OUTER_FLOW:BB[0-9_]+]]
+; GCN-NEXT: s_cbranch_execz [[THEN_OUTER_FLOW]]
 ; GCN-NEXT: {{^BB[0-9_]+}}:
 ; GCN: store_dword
 ; GCN-NEXT: {{^}}[[THEN_OUTER_FLOW]]:
@@ -153,6 +157,7 @@ bb.outer.end: ; preds = %bb, %bb.then, %b
 ; GCN: store_dword
 ; GCN-NEXT: s_and_saveexec_b64 [[SAVEEXEC_INNER_IF_OUTER_THEN:s\[[0-9:]+\]]]
 ; GCN-NEXT: ; mask branch [[FLOW1:BB[0-9_]+]]
+; GCN-NEXT: s_cbranch_execz [[FLOW1]]
 ; GCN-NEXT: {{^BB[0-9_]+}}:
 ; GCN: store_dword
 ; GCN-NEXT: [[FLOW1]]:
@@ -199,6 +204,7 @@ bb.outer.end:
 ; ALL-LABEL: {{^}}s_endpgm_unsafe_barrier:
 ; GCN: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]]
 ; GCN-NEXT: ; mask branch [[ENDIF:BB[0-9_]+]]
+; GCN-NEXT: s_cbranch_execz [[ENDIF]]
 ; GCN-NEXT: {{^BB[0-9_]+}}:
 ; GCN: store_dword
 ; GCN-NEXT: {{^}}[[ENDIF]]:
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index a02e6f8..e848e75 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -367,6 +367,7 @@ bb7: ; preds = %bb4
 ; CHECK: v_cmp_neq_f32_e32 vcc, 0,
 ; CHECK: s_and_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc
 ; CHECK: mask branch [[END:BB[0-9]+_[0-9]+]]
+; CHECK-NEXT: s_cbranch_execz [[END]]
 ; CHECK-NOT: branch
 ; CHECK: BB{{[0-9]+_[0-9]+}}: ; %bb8
diff --git a/llvm/test/CodeGen/AMDGPU/valu-i1.ll b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
index c64d4fc..3a9970e 100644
--- a/llvm/test/CodeGen/AMDGPU/valu-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
@@ -66,6 +66,7 @@ end:
 ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
 ; SI-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
+; SI-NEXT: s_cbranch_execz [[EXIT]]
 ; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
 ; SI: buffer_store_dword
@@ -92,6 +93,7 @@ exit:
 ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
 ; SI-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
+; SI-NEXT: s_cbranch_execz [[EXIT]]
 ; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
 ; SI: buffer_store_dword
@@ -129,6 +131,7 @@ exit:
 ; SI-NEXT: s_or_saveexec_b64
 ; SI-NEXT: s_xor_b64 exec, exec
 ; SI-NEXT: ; mask branch [[UNIFIED_RETURN:BB[0-9]+_[0-9]+]]
+; SI-NEXT: s_cbranch_execz [[UNIFIED_RETURN]]
 ; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %then
 ; SI: s_waitcnt
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 3d18183..842cf25 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -423,6 +423,7 @@ END:
 ;CHECK-NEXT: s_and_b64 [[SAVED]], exec, [[SAVED]]
 ;CHECK-NEXT: s_xor_b64 exec, exec, [[SAVED]]
 ;CHECK-NEXT: mask branch [[END_BB:BB[0-9]+_[0-9]+]]
+;CHECK-NEXT: s_cbranch_execz [[END_BB]]
 ;CHECK-NEXT: BB{{[0-9]+_[0-9]+}}: ; %ELSE
 ;CHECK: store_dword
 ;CHECK: [[END_BB]]: ; %END
-- 
2.7.4
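
Note: what follows is a minimal, standalone C++ sketch of the skip decision this
patch changes, for readers who do not want to trace through SIInsertSkips.cpp.
The InstrKind enum, the simplified shouldSkip() signature, and the main() driver
are illustrative assumptions, not LLVM's actual API; the real pass queries
SIInstrInfo (TII->isSMRD(), TII->isVMEM()) and the AMDGPU::S_WAITCNT opcode as
shown in the hunk above. The idea: the pass normally emits a skip branch only
once a block grows past SkipThreshold, but after this change any SMRD, VMEM, or
s_waitcnt instruction forces the skip, because those remain expensive even when
EXEC = 0.

    #include <cstdio>
    #include <vector>

    // Hypothetical instruction categories (illustrative only); the real pass
    // inspects MachineInstrs via SIInstrInfo rather than an enum like this.
    enum class InstrKind { VALU, SALU, SMRD, VMEM, WAITCNT };

    // Decide whether a conditional skip branch (s_cbranch_execz) over Block is
    // worthwhile: either the block exceeds the size threshold, or it contains
    // an instruction that stays expensive even when EXEC = 0.
    bool shouldSkip(const std::vector<InstrKind> &Block, unsigned SkipThreshold) {
      unsigned NumInstr = 0;
      for (InstrKind Kind : Block) {
        // The check this patch adds: memory accesses and wait counts cost
        // cycles regardless of EXEC, so always allow skipping over them.
        if (Kind == InstrKind::SMRD || Kind == InstrKind::VMEM ||
            Kind == InstrKind::WAITCNT)
          return true;

        if (++NumInstr >= SkipThreshold)
          return true;
      }
      return false;
    }

    int main() {
      std::vector<InstrKind> Cheap = {InstrKind::VALU, InstrKind::SALU};
      std::vector<InstrKind> WithLoad = {InstrKind::VALU, InstrKind::VMEM};

      // 12 matches the pass's default amdgpu-skip-threshold.
      std::printf("skip over cheap block:     %d\n", shouldSkip(Cheap, 12));    // 0
      std::printf("skip over block with load: %d\n", shouldSkip(WithLoad, 12)); // 1
      return 0;
    }

The real shouldSkip() also returns true for barriers and for instructions with
unwanted effects when EXEC is empty; the sketch only models the threshold plus
the new check. That is why the test updates above simply add an s_cbranch_execz
after each mask branch that previously had none, even over very small blocks.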