[AMDGPU] Mark s_barrier as having side effects but not accessing memory.
author Jay Foad <jay.foad@gmail.com>
Fri, 6 Sep 2019 10:07:28 +0000 (10:07 +0000)
committer Jay Foad <jay.foad@gmail.com>
Fri, 6 Sep 2019 10:07:28 +0000 (10:07 +0000)
Summary:
This fixes poor scheduling in a function containing a barrier and a few
load instructions.

Without this fix, ScheduleDAGInstrs::buildSchedGraph adds an artificial
edge in the dependency graph from the barrier instruction to the exit
node representing live-out latency, with a latency of about 500 cycles.
As a result, the scheduler believes that the critical path through the
graph also has a latency of about 500 cycles, so it does not consider
any of the load instructions to be on the critical path. It therefore
schedules them with no regard for their (80 cycle) latency, which gives
poor results.

Reviewers: arsenm, dstuttard, tpr, nhaehnle

Subscribers: kzhuravl, jvesely, wdng, yaxunl, t-tye, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D67218

llvm-svn: 371192

llvm/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/lib/Target/AMDGPU/SOPInstructions.td
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
llvm/test/CodeGen/AMDGPU/schedule-barrier.mir [new file with mode: 0644]
llvm/test/CodeGen/AMDGPU/v_mac_f16.ll
llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll

index 14605a433e5e9561ccd59acced66e27271ed07d6..29a3d2460eb8e7fac2cdcd1ae2ccada3b69cba6e 100644 (file)
@@ -206,7 +206,7 @@ def int_amdgcn_s_sendmsghalt : GCCBuiltin<"__builtin_amdgcn_s_sendmsghalt">,
   [ImmArg<0>, IntrNoMem, IntrHasSideEffects]>;
 
 def int_amdgcn_s_barrier : GCCBuiltin<"__builtin_amdgcn_s_barrier">,
-  Intrinsic<[], [], [IntrConvergent]>;
+  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent]>;
 
 def int_amdgcn_wave_barrier : GCCBuiltin<"__builtin_amdgcn_wave_barrier">,
   Intrinsic<[], [], [IntrConvergent]>;
index d2d783ba76cdd048473ccd82835233f7bf1aa625..34082eb0cd4da32d0cfafe934e79b7448bf8ae66 100644 (file)
@@ -1076,8 +1076,6 @@ def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier",
   [(int_amdgcn_s_barrier)]> {
   let SchedRW = [WriteBarrier];
   let simm16 = 0;
-  let mayLoad = 1;
-  let mayStore = 1;
   let isConvergent = 1;
 }
 
index 2dd7e20c00ccf77915c389c86b0f0c696b7d276c..0bf3125b22bf68273300f1019453e2f218694d31 100644 (file)
@@ -16,9 +16,9 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 {
 ; VARIANT0-NEXT:    v_mov_b32_e32 v2, 0
 ; VARIANT0-NEXT:    s_waitcnt lgkmcnt(0)
 ; VARIANT0-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
-; VARIANT0-NEXT:    v_add_i32_e32 v3, vcc, s2, v3
 ; VARIANT0-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; VARIANT0-NEXT:    s_barrier
+; VARIANT0-NEXT:    v_add_i32_e32 v3, vcc, s2, v3
 ; VARIANT0-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; VARIANT0-NEXT:    v_lshl_b64 v[3:4], v[3:4], 2
 ; VARIANT0-NEXT:    buffer_load_dword v0, v[3:4], s[4:7], 0 addr64
@@ -37,8 +37,8 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 {
 ; VARIANT1-NEXT:    v_mov_b32_e32 v2, 0
 ; VARIANT1-NEXT:    s_waitcnt lgkmcnt(0)
 ; VARIANT1-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
-; VARIANT1-NEXT:    v_add_i32_e32 v3, vcc, s2, v3
 ; VARIANT1-NEXT:    s_barrier
+; VARIANT1-NEXT:    v_add_i32_e32 v3, vcc, s2, v3
 ; VARIANT1-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; VARIANT1-NEXT:    v_lshl_b64 v[3:4], v[3:4], 2
 ; VARIANT1-NEXT:    s_waitcnt expcnt(0)
@@ -50,46 +50,46 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 {
 ; VARIANT2-LABEL: test_barrier:
 ; VARIANT2:       ; %bb.0: ; %entry
 ; VARIANT2-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VARIANT2-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; VARIANT2-NEXT:    s_load_dword s0, s[0:1], 0x2c
+; VARIANT2-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
 ; VARIANT2-NEXT:    s_waitcnt lgkmcnt(0)
-; VARIANT2-NEXT:    v_mov_b32_e32 v2, s3
-; VARIANT2-NEXT:    v_add_co_u32_e32 v1, vcc, s2, v1
-; VARIANT2-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
-; VARIANT2-NEXT:    global_store_dword v[1:2], v0, off
+; VARIANT2-NEXT:    v_mov_b32_e32 v4, s3
+; VARIANT2-NEXT:    v_xad_u32 v1, v0, -1, s0
+; VARIANT2-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; VARIANT2-NEXT:    v_add_co_u32_e32 v3, vcc, s2, v3
+; VARIANT2-NEXT:    v_lshlrev_b64 v[1:2], 2, v[1:2]
+; VARIANT2-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; VARIANT2-NEXT:    global_store_dword v[3:4], v0, off
+; VARIANT2-NEXT:    v_mov_b32_e32 v5, s3
+; VARIANT2-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v1
+; VARIANT2-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v2, vcc
 ; VARIANT2-NEXT:    s_waitcnt vmcnt(0)
 ; VARIANT2-NEXT:    s_barrier
-; VARIANT2-NEXT:    v_xad_u32 v3, v0, -1, s0
-; VARIANT2-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
-; VARIANT2-NEXT:    v_lshlrev_b64 v[3:4], 2, v[3:4]
-; VARIANT2-NEXT:    v_mov_b32_e32 v0, s3
-; VARIANT2-NEXT:    v_add_co_u32_e32 v3, vcc, s2, v3
-; VARIANT2-NEXT:    v_addc_co_u32_e32 v4, vcc, v0, v4, vcc
-; VARIANT2-NEXT:    global_load_dword v0, v[3:4], off
+; VARIANT2-NEXT:    global_load_dword v0, v[0:1], off
 ; VARIANT2-NEXT:    s_waitcnt vmcnt(0)
-; VARIANT2-NEXT:    global_store_dword v[1:2], v0, off
+; VARIANT2-NEXT:    global_store_dword v[3:4], v0, off
 ; VARIANT2-NEXT:    s_endpgm
 ;
 ; VARIANT3-LABEL: test_barrier:
 ; VARIANT3:       ; %bb.0: ; %entry
 ; VARIANT3-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VARIANT3-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; VARIANT3-NEXT:    s_load_dword s0, s[0:1], 0x2c
+; VARIANT3-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
 ; VARIANT3-NEXT:    s_waitcnt lgkmcnt(0)
-; VARIANT3-NEXT:    v_mov_b32_e32 v2, s3
-; VARIANT3-NEXT:    v_add_co_u32_e32 v1, vcc, s2, v1
-; VARIANT3-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
-; VARIANT3-NEXT:    global_store_dword v[1:2], v0, off
-; VARIANT3-NEXT:    s_barrier
-; VARIANT3-NEXT:    v_xad_u32 v3, v0, -1, s0
-; VARIANT3-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
-; VARIANT3-NEXT:    v_lshlrev_b64 v[3:4], 2, v[3:4]
-; VARIANT3-NEXT:    v_mov_b32_e32 v0, s3
+; VARIANT3-NEXT:    v_mov_b32_e32 v4, s3
+; VARIANT3-NEXT:    v_xad_u32 v1, v0, -1, s0
+; VARIANT3-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; VARIANT3-NEXT:    v_add_co_u32_e32 v3, vcc, s2, v3
-; VARIANT3-NEXT:    v_addc_co_u32_e32 v4, vcc, v0, v4, vcc
-; VARIANT3-NEXT:    global_load_dword v0, v[3:4], off
+; VARIANT3-NEXT:    v_lshlrev_b64 v[1:2], 2, v[1:2]
+; VARIANT3-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; VARIANT3-NEXT:    global_store_dword v[3:4], v0, off
+; VARIANT3-NEXT:    v_mov_b32_e32 v5, s3
+; VARIANT3-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v1
+; VARIANT3-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v2, vcc
+; VARIANT3-NEXT:    s_barrier
+; VARIANT3-NEXT:    global_load_dword v0, v[0:1], off
 ; VARIANT3-NEXT:    s_waitcnt vmcnt(0)
-; VARIANT3-NEXT:    global_store_dword v[1:2], v0, off
+; VARIANT3-NEXT:    global_store_dword v[3:4], v0, off
 ; VARIANT3-NEXT:    s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
index a070488a4bc05f4f073112d9cc60dc16426e916d..f6611ec26a8237515a250f479392968e47051aff 100644 (file)
@@ -38,18 +38,16 @@ entry:
 
 ; GCN-LABEL: {{^}}local_memory_two_objects:
 ; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
+; CI-DAG: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
 ; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
 ; SI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
-
-; GCN: s_barrier
-
 ; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]]
 ; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]
 
+; GCN: s_barrier
+
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]
-
-; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
 
 define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir
new file mode 100644 (file)
index 0000000..a72a406
--- /dev/null
@@ -0,0 +1,46 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=machine-scheduler -o - %s | FileCheck %s
+
+---
+# Check that the high latency loads are both scheduled first, before the
+# multiplies, despite the presence of a barrier in the function.
+# CHECK: BUFFER_LOAD_DWORD_OFFSET
+# CHECK: BUFFER_LOAD_DWORD_OFFSET
+# CHECK: V_MUL_LO_U32
+# CHECK: V_MUL_LO_U32
+name: test
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9
+
+    undef %43.sub3:vreg_128 = COPY $vgpr9
+    undef %42.sub2:vreg_128 = COPY $vgpr8
+    undef %41.sub1:vreg_128 = COPY $vgpr7
+    undef %26.sub0:vreg_128 = COPY $vgpr6
+    undef %46.sub3:vreg_128 = COPY $vgpr5
+    undef %45.sub2:vreg_128 = COPY $vgpr4
+    undef %44.sub1:vreg_128 = COPY $vgpr3
+    undef %32.sub0:vreg_128 = COPY $vgpr2
+    undef %38.sub1:vreg_64 = COPY $vgpr1
+    %38.sub0:vreg_64 = COPY $vgpr0
+
+    S_BARRIER
+
+    undef %33.sub0:sgpr_128 = V_READFIRSTLANE_B32 %32.sub0, implicit $exec
+    %33.sub1:sgpr_128 = V_READFIRSTLANE_B32 %44.sub1, implicit $exec
+    %33.sub2:sgpr_128 = V_READFIRSTLANE_B32 %45.sub2, implicit $exec
+    %33.sub3:sgpr_128 = V_READFIRSTLANE_B32 %46.sub3, implicit $exec
+    %15:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %33, 0, 0, 0, 0, 0, 0, implicit $exec
+    %39:vgpr_32 = V_MUL_LO_U32 %15, %15, implicit $exec
+
+    undef %27.sub0:sgpr_128 = V_READFIRSTLANE_B32 %26.sub0, implicit $exec
+    %27.sub1:sgpr_128 = V_READFIRSTLANE_B32 %41.sub1, implicit $exec
+    %27.sub2:sgpr_128 = V_READFIRSTLANE_B32 %42.sub2, implicit $exec
+    %27.sub3:sgpr_128 = V_READFIRSTLANE_B32 %43.sub3, implicit $exec
+    %19:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %27, 0, 0, 0, 0, 0, 0, implicit $exec
+    %40:vgpr_32 = V_MUL_LO_U32 %19, %19, implicit $exec
+
+    %23:vgpr_32 = V_ADD_U32_e32 %39, %40, implicit $exec
+    GLOBAL_STORE_DWORD %38, %23, 0, 0, 0, 0, implicit $exec
+    S_ENDPGM 0
+...
index f2c2cc847c71f83d50cff5548a430b2e7b87c293..e503fbb51ad76ec3301752176a86c4e62f188e30 100644 (file)
@@ -304,14 +304,17 @@ entry:
 ; GCN: {{buffer|flat}}_load_dword v[[C_V2_F16:[0-9]+]]
 
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI:  v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI:  v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
+; SI-DAG:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+
+; SI-DAG:  v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+
+; SI-DAG:  v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
 ; SI-DAG:  v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
+
 ; SI-DAG:  v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]]
 ; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
 ; SI-DAG:  v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]]
index c8777f441f2a764e00e6519f0d6adfed3018eb64..4cbd89147722ba07a9ccb48a2c5e22cd214cb0bd 100644 (file)
@@ -5,8 +5,8 @@
 ; GCN-LABEL: barrier_vmcnt_global:
 ; GFX8:         flat_load_dword
 ; GFX9_10:      global_load_dword
-; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX9_10-NEXT: s_waitcnt vmcnt(0){{$}}
+; GFX8:         s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX9_10:      s_waitcnt vmcnt(0){{$}}
 ; GCN-NEXT:     s_barrier
 define amdgpu_kernel void @barrier_vmcnt_global(i32 addrspace(1)* %arg) {
 bb:
@@ -28,9 +28,9 @@ bb:
 ; GCN-LABEL: barrier_vscnt_global:
 ; GFX8:       flat_store_dword
 ; GFX9_10:    global_store_dword
-; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX9-NEXT:  s_waitcnt vmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX9:       s_waitcnt vmcnt(0){{$}}
+; GFX10:      s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT:   s_barrier
 define amdgpu_kernel void @barrier_vscnt_global(i32 addrspace(1)* %arg) {
 bb:
@@ -54,9 +54,9 @@ bb:
 ; GCN-LABEL: barrier_vmcnt_vscnt_global:
 ; GFX8:         flat_load_dword
 ; GFX9_10:      global_load_dword
-; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX9_10-NEXT: s_waitcnt vmcnt(0){{$}}
-; GFX10-NEXT:   s_waitcnt_vscnt null, 0x0
+; GFX8:         s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX9_10:      s_waitcnt vmcnt(0){{$}}
+; GFX10:        s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT:     s_barrier
 define amdgpu_kernel void @barrier_vmcnt_vscnt_global(i32 addrspace(1)* %arg) {
 bb:
@@ -81,7 +81,7 @@ bb:
 
 ; GCN-LABEL: barrier_vmcnt_flat:
 ; GCN:      flat_load_dword
-; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT: s_barrier
 define amdgpu_kernel void @barrier_vmcnt_flat(i32* %arg) {
 bb:
@@ -102,9 +102,9 @@ bb:
 
 ; GCN-LABEL: barrier_vscnt_flat:
 ; GCN:         flat_store_dword
-; GFX8_9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX10-NEXT:  s_waitcnt lgkmcnt(0){{$}}
-; GFX10-NEXT:  s_waitcnt_vscnt null, 0x0
+; GFX8_9:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10:       s_waitcnt lgkmcnt(0){{$}}
+; GFX10:       s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT:    s_barrier
 define amdgpu_kernel void @barrier_vscnt_flat(i32* %arg) {
 bb:
@@ -127,8 +127,8 @@ bb:
 
 ; GCN-LABEL: barrier_vmcnt_vscnt_flat:
 ; GCN:        flat_load_dword
-; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10:      s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT:   s_barrier
 define amdgpu_kernel void @barrier_vmcnt_vscnt_flat(i32* %arg) {
 bb:
@@ -153,8 +153,8 @@ bb:
 
 ; GCN-LABEL: barrier_vmcnt_vscnt_flat_workgroup:
 ; GCN:        flat_load_dword
-; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10:      s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT:   s_barrier
 define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(i32* %arg) {
 bb:
@@ -218,8 +218,8 @@ bb:
 
 ; GCN-LABEL: store_vscnt_private:
 ; GCN:         buffer_store_dword
-; GFX8_9-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT:  s_waitcnt_vscnt null, 0x0
+; GFX8_9:      s_waitcnt vmcnt(0)
+; GFX10:       s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT:    s_setpc_b64
 define void @store_vscnt_private(i32 addrspace(5)* %p) {
   store i32 0, i32 addrspace(5)* %p
@@ -229,8 +229,8 @@ define void @store_vscnt_private(i32 addrspace(5)* %p) {
 ; GCN-LABEL: store_vscnt_global:
 ; GFX8:        flat_store_dword
 ; GFX9_10:     global_store_dword
-; GFX8_9-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT:  s_waitcnt_vscnt null, 0x0
+; GFX8_9:      s_waitcnt vmcnt(0)
+; GFX10:       s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT:    s_setpc_b64
 define void @store_vscnt_global(i32 addrspace(1)* %p) {
   store i32 0, i32 addrspace(1)* %p
@@ -239,9 +239,9 @@ define void @store_vscnt_global(i32 addrspace(1)* %p) {
 
 ; GCN-LABEL: store_vscnt_flat:
 ; GCN:         flat_store_dword
-; GFX8_9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX10-NEXT:  s_waitcnt lgkmcnt(0){{$}}
-; GFX10-NEXT:  s_waitcnt_vscnt null, 0x0
+; GFX8_9:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10:       s_waitcnt lgkmcnt(0){{$}}
+; GFX10:       s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT:    s_setpc_b64
 define void @store_vscnt_flat(i32* %p) {
   store i32 0, i32* %p
@@ -249,8 +249,8 @@ define void @store_vscnt_flat(i32* %p) {
 }
 
 ; GCN-LABEL: function_prologue:
-; GCN-NEXT:   s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN:        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0){{$}}
+; GFX10:      s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT:   s_setpc_b64
 define void @function_prologue() {
   ret void