[AMDGPU] Disable optimizeEndCf at -O0
author: Christudasan Devadasan <Christudasan.Devadasan@amd.com>
Fri, 7 Jan 2022 16:52:00 +0000 (11:52 -0500)
committer: Christudasan Devadasan <Christudasan.Devadasan@amd.com>
Tue, 18 Jan 2022 07:48:52 +0000 (02:48 -0500)
Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D116819

llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
llvm/test/CodeGen/AMDGPU/collapse-endcf.ll

index 6ec37b3..e1018bd 100644 (file)
@@ -56,6 +56,7 @@
 #include "llvm/CodeGen/LiveVariables.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Target/TargetMachine.h"
 
 using namespace llvm;
 
@@ -90,6 +91,8 @@ private:
   unsigned OrSaveExecOpc;
   unsigned Exec;
 
+  bool EnableOptimizeEndCf = false;
+
   bool hasKill(const MachineBasicBlock *Begin, const MachineBasicBlock *End);
 
   void emitIf(MachineInstr &MI);
@@ -579,7 +582,7 @@ void SILowerControlFlow::combineMasks(MachineInstr &MI) {
 void SILowerControlFlow::optimizeEndCf() {
   // If the only instruction immediately following this END_CF is an another
   // END_CF in the only successor we can avoid emitting exec mask restore here.
-  if (!RemoveRedundantEndcf)
+  if (!EnableOptimizeEndCf)
     return;
 
   for (MachineInstr *MI : reverse(LoweredEndCf)) {
@@ -807,6 +810,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   TII = ST.getInstrInfo();
   TRI = &TII->getRegisterInfo();
+  EnableOptimizeEndCf =
+      RemoveRedundantEndcf && MF.getTarget().getOptLevel() > CodeGenOpt::None;
 
   // This doesn't actually need LiveIntervals, but we can preserve them.
   LIS = getAnalysisIfAvailable<LiveIntervals>();
index 718fbdc..23879e5 100644 (file)
@@ -1,5 +1,8 @@
 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-remove-redundant-endcf < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
+; Disabled endcf collapse at -O0.
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -O0 -amdgpu-remove-redundant-endcf < %s | FileCheck -enable-var-scope -check-prefix=GCN-O0 %s
+
 ; GCN-LABEL: {{^}}simple_nested_if:
 ; GCN:      s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]]
 ; GCN-NEXT: s_cbranch_execz [[ENDIF:.LBB[0-9_]+]]
 ; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC]]
 ; GCN: ds_write_b32
 ; GCN: s_endpgm
-
+;
+; GCN-O0-LABEL: {{^}}simple_nested_if:
+; GCN-O0:      s_mov_b64 s[{{[0-9:]+}}], exec
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_0:[0-9]+]]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_1:[0-9]+]]
+; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_OUTER:.LBB[0-9_]+]]
+; GCN-O0-NEXT: ; %bb.{{[0-9]+}}:
+; GCN-O0:      s_mov_b64 s[{{[0-9:]+}}], exec
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_0:[0-9]+]]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_1:[0-9]+]]
+; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_INNER:.LBB[0-9_]+]]
+; GCN-O0-NEXT: ; %bb.{{[0-9]+}}:
+; GCN-O0:      store_dword
+; GCN-O0-NEXT: {{^}}[[ENDIF_INNER]]:
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_SPILL_LANE_0]]
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_SPILL_LANE_1]]
+; GCN-O0-NEXT: s_or_b64 exec, exec, s[{{[0-9:]+}}]
+; GCN-O0-NEXT: {{^}}[[ENDIF_OUTER]]:
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_SPILL_LANE_0]]
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_SPILL_LANE_1]]
+; GCN-O0-NEXT: s_or_b64 exec, exec, s[{{[0-9:]+}}]
+; GCN-O0:      ds_write_b32
+; GCN-O0:      s_endpgm
+;
 define amdgpu_kernel void @simple_nested_if(i32 addrspace(1)* nocapture %arg) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -49,6 +79,38 @@ bb.outer.end:                                     ; preds = %bb.outer.then, %bb.
 ; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_OUTER]]
 ; GCN: ds_write_b32
 ; GCN: s_endpgm
+;
+; GCN-O0-LABEL: {{^}}uncollapsable_nested_if:
+; GCN-O0:      s_mov_b64 s[{{[0-9:]+}}], exec
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_0:[0-9]+]]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_1:[0-9]+]]
+; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_OUTER:.LBB[0-9_]+]]
+; GCN-O0-NEXT: ; %bb.{{[0-9]+}}:
+; GCN-O0:      s_mov_b64 s[{{[0-9:]+}}], exec
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_0:[0-9]+]]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_1:[0-9]+]]
+; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_INNER:.LBB[0-9_]+]]
+; GCN-O0-NEXT: ; %bb.{{[0-9]+}}:
+; GCN-O0:      store_dword
+; GCN-O0-NEXT: s_branch [[ENDIF_INNER]]
+; GCN-O0-NEXT: {{^}}[[ENDIF_OUTER]]:
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_SPILL_LANE_0]]
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_SPILL_LANE_1]]
+; GCN-O0-NEXT: s_or_b64 exec, exec, s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_branch [[LAST_BB:.LBB[0-9_]+]]
+; GCN-O0-NEXT: {{^}}[[ENDIF_INNER]]:
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_SPILL_LANE_0]]
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_SPILL_LANE_1]]
+; GCN-O0-NEXT: s_or_b64 exec, exec, s[{{[0-9:]+}}]
+; GCN-O0:      s_branch [[ENDIF_OUTER]]
+; GCN-O0-NEXT: {{^}}[[LAST_BB]]:
+; GCN-O0:      ds_write_b32
+; GCN-O0:      s_endpgm
+;
 define amdgpu_kernel void @uncollapsable_nested_if(i32 addrspace(1)* nocapture %arg) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -95,6 +157,48 @@ bb.outer.end:                                     ; preds = %bb.inner.then, %bb
 ; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_OUTER]]
 ; GCN: ds_write_b32
 ; GCN: s_endpgm
+;
+; GCN-O0-LABEL: {{^}}nested_if_if_else:
+; GCN-O0:      s_mov_b64 s[{{[0-9:]+}}], exec
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_0:[0-9]+]]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_1:[0-9]+]]
+; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_OUTER:.LBB[0-9_]+]]
+; GCN-O0-NEXT: ; %bb.{{[0-9]+}}:
+; GCN-O0:      s_mov_b64 s[{{[0-9:]+}}], exec
+; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_xor_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[THEN_SPILL_LANE_0:[0-9]+]]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[THEN_SPILL_LANE_1:[0-9]+]]
+; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_cbranch_execz [[THEN_INNER:.LBB[0-9_]+]]
+; GCN-O0-NEXT: s_branch [[TEMP_BB:.LBB[0-9_]+]]
+; GCN-O0-NEXT: {{^}}[[THEN_INNER]]:
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[THEN_SPILL_LANE_0]]
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[THEN_SPILL_LANE_1]]
+; GCN-O0-NEXT: s_or_saveexec_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], exec, s[{{[0-9:]+}}]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_0:[0-9]+]]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_1:[0-9]+]]
+; GCN-O0-NEXT: s_xor_b64 exec, exec, s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_INNER:.LBB[0-9_]+]]
+; GCN-O0-NEXT: ; %bb.{{[0-9]+}}:
+; GCN-O0:      store_dword
+; GCN-O0-NEXT: s_branch [[ENDIF_INNER]]
+; GCN-O0-NEXT: {{^}}[[TEMP_BB]]:
+; GCN-O0:      s_branch [[THEN_INNER]]
+; GCN-O0-NEXT: {{^}}[[ENDIF_INNER]]:
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_SPILL_LANE_0]]
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_SPILL_LANE_1]]
+; GCN-O0-NEXT: s_or_b64 exec, exec, s[{{[0-9:]+}}]
+; GCN-O0-NEXT: {{^}}[[ENDIF_OUTER]]:
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_SPILL_LANE_0]]
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_SPILL_LANE_1]]
+; GCN-O0-NEXT: s_or_b64 exec, exec, s[{{[0-9:]+}}]
+; GCN-O0:      ds_write_b32
+; GCN-O0:      s_endpgm
+;
 define amdgpu_kernel void @nested_if_if_else(i32 addrspace(1)* nocapture %arg) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -151,6 +255,61 @@ bb.outer.end:                                        ; preds = %bb, %bb.then, %b
 ; GCN:      s_or_b64 exec, exec, [[SAVEEXEC_OUTER3]]
 ; GCN:      ds_write_b32
 ; GCN:      s_endpgm
+;
+; GCN-O0-LABEL: {{^}}nested_if_else_if:
+; GCN-O0:      s_mov_b64 s[{{[0-9:]+}}], exec
+; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_xor_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_0:[0-9]+]]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_1:[0-9]+]]
+; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_cbranch_execz [[THEN_OUTER:.LBB[0-9_]+]]
+; GCN-O0-NEXT: s_branch [[INNER_IF_OUTER_ELSE:.LBB[0-9_]+]]
+; GCN-O0-NEXT: {{^}}[[THEN_OUTER]]:
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_SPILL_LANE_0]]
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_SPILL_LANE_1]]
+; GCN-O0-NEXT: s_or_saveexec_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], exec, s[{{[0-9:]+}}]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_2_SPILL_LANE_0:[0-9]+]]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_2_SPILL_LANE_1:[0-9]+]]
+; GCN-O0-NEXT: s_xor_b64 exec, exec, s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_OUTER:.LBB[0-9_]+]]
+; GCN-O0-NEXT: ; %bb.{{[0-9]+}}:
+; GCN-O0:      store_dword
+; GCN-O0:      s_mov_b64 s[{{[0-9:]+}}], exec
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[ELSE_SPILL_LANE_0:[0-9]+]]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[ELSE_SPILL_LANE_1:[0-9]+]]
+; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_cbranch_execz [[FLOW1:.LBB[0-9_]+]]
+; GCN-O0-NEXT: ; %bb.{{[0-9]+}}:
+; GCN-O0:      store_dword
+; GCN-O0-NEXT: s_branch [[FLOW1]]
+; GCN-O0-NEXT: {{^}}[[INNER_IF_OUTER_ELSE]]
+; GCN-O0:      s_mov_b64 s[{{[0-9:]+}}], exec
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_IF_OUTER_ELSE_SPILL_LANE_0:[0-9]+]]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_IF_OUTER_ELSE_SPILL_LANE_1:[0-9]+]]
+; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_cbranch_execz [[THEN_OUTER_FLOW:.LBB[0-9_]+]]
+; GCN-O0-NEXT: ; %bb.{{[0-9]+}}:
+; GCN-O0:      store_dword
+; GCN-O0-NEXT: {{^}}[[THEN_OUTER_FLOW]]
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_IF_OUTER_ELSE_SPILL_LANE_0]]
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_IF_OUTER_ELSE_SPILL_LANE_1]]
+; GCN-O0-NEXT: s_or_b64 exec, exec, s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_branch [[THEN_OUTER]]
+; GCN-O0-NEXT: {{^}}[[FLOW1]]
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[ELSE_SPILL_LANE_0]]
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[ELSE_SPILL_LANE_1]]
+; GCN-O0-NEXT: s_or_b64 exec, exec, s[{{[0-9:]+}}]
+; GCN-O0-NEXT: {{^}}[[ENDIF_OUTER]]
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_2_SPILL_LANE_0]]
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_2_SPILL_LANE_1]]
+; GCN-O0-NEXT: s_or_b64 exec, exec, s[{{[0-9:]+}}]
+; GCN-O0:      ds_write_b32
+; GCN-O0:      s_endpgm
+;
 define amdgpu_kernel void @nested_if_else_if(i32 addrspace(1)* nocapture %arg) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -195,6 +354,23 @@ bb.outer.end:
 ; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC]]
 ; GCN:      s_barrier
 ; GCN-NEXT: s_endpgm
+;
+; GCN-O0-LABEL: {{^}}s_endpgm_unsafe_barrier:
+; GCN-O0:      s_mov_b64 s[{{[0-9:]+}}], exec
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[SPILL_LANE_0:[0-9]+]]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[SPILL_LANE_1:[0-9]+]]
+; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_cbranch_execz [[ENDIF:.LBB[0-9_]+]]
+; GCN-O0-NEXT: ; %bb.{{[0-9]+}}:
+; GCN-O0:      store_dword
+; GCN-O0-NEXT: {{^}}[[ENDIF]]:
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[SPILL_LANE_0]]
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[SPILL_LANE_1]]
+; GCN-O0-NEXT: s_or_b64 exec, exec, s[{{[0-9:]+}}]
+; GCN-O0:      s_barrier
+; GCN-O0:      s_endpgm
+;
 define amdgpu_kernel void @s_endpgm_unsafe_barrier(i32 addrspace(1)* nocapture %arg) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -238,6 +414,75 @@ bb.end:                                           ; preds = %bb.then, %bb
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
 ; GCN: s_setpc_b64
+;
+; GCN-O0-LABEL: {{^}}scc_liveness:
+; GCN-O0-COUNT-2: buffer_store_dword
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_0:[0-9]+]]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_1:[0-9]+]]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0:[0-9]+]]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1:[0-9]+]]
+; GCN-O0: [[INNER_LOOP:.LBB[0-9]+_[0-9]+]]:
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]]
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]]
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]]
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]]
+; GCN-O0: buffer_load_dword
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_LOOP_EXEC_SPILL_LANE_0:[0-9]+]]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_LOOP_EXEC_SPILL_LANE_1:[0-9]+]]
+; GCN-O0:      s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_OUT_EXEC_SPILL_LANE_0:[0-9]+]]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_OUT_EXEC_SPILL_LANE_1:[0-9]+]]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]]
+; GCN-O0-NEXT: s_mov_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]]
+; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_cbranch_execnz [[INNER_LOOP]]
+; GCN-O0-NEXT: ; %bb.{{[0-9]+}}:
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_OUT_EXEC_SPILL_LANE_0]]
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_OUT_EXEC_SPILL_LANE_1]]
+; GCN-O0-NEXT: s_or_b64 exec, exec, s[{{[0-9:]+}}]
+; GCN-O0:      s_mov_b64 s[{{[0-9:]+}}], exec
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW2_IN_EXEC_SPILL_LANE_0:[0-9]+]]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW2_IN_EXEC_SPILL_LANE_1:[0-9]+]]
+; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_cbranch_execz [[FLOW2:.LBB[0-9_]+]]
+; GCN-O0: {{^}}[[FLOW2]]:
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[FLOW2_IN_EXEC_SPILL_LANE_0]]
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[FLOW2_IN_EXEC_SPILL_LANE_1]]
+; GCN-O0:      s_branch [[FLOW:.LBB[0-9_]+]]
+; GCN-O0: {{^}}[[FLOW]]:
+; GCN-O0:      s_mov_b64 s[{{[0-9:]+}}], exec
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW3_IN_EXEC_SPILL_LANE_0:[0-9]+]]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW3_IN_EXEC_SPILL_LANE_1:[0-9]+]]
+; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_cbranch_execz [[FLOW3:.LBB[0-9_]+]]
+; GCN-O0:      ; %bb.{{[0-9]+}}:
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW1_OUT_EXEC_SPILL_LANE_0:[0-9]+]]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW1_OUT_EXEC_SPILL_LANE_1:[0-9]+]]
+; GCN-O0: {{^}}[[FLOW3]]:
+; GCN-O0-COUNT-4: buffer_load_dword
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_LOOP_EXEC_SPILL_LANE_0]]
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_LOOP_EXEC_SPILL_LANE_1]]
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[FLOW1_OUT_EXEC_SPILL_LANE_0]]
+; GCN-O0-DAG:  v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[FLOW1_OUT_EXEC_SPILL_LANE_1]]
+; GCN-O0:      s_and_b64 s[{{[0-9:]+}}], exec, s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
+; GCN-O0-COUNT-2: s_mov_b64
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]]
+; GCN-O0-DAG:  v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]]
+; GCN-O0-COUNT-4: buffer_store_dword
+; GCN-O0:      s_andn2_b64 exec, exec, s[{{[0-9:]+}}]
+; GCN-O0-NEXT: s_cbranch_execnz [[INNER_LOOP]]
+; GCN-O0:      ; %bb.{{[0-9]+}}:
+; GCN-O0-COUNT-4: buffer_store_dword
+; GCN-O0:     s_setpc_b64
+;
 define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
 bb:
   br label %bb1