From 0bc14a0a989fe4268b899100aafc07e3d94decbb Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Wed, 19 Jan 2022 12:42:56 +0000
Subject: [PATCH] [AMDGPU] Tweak some compares in wqm.ll test

This prevents the compares from being optimized away when D86578 lands,
which seems unintended. Also fixed some unused results.
---
 llvm/test/CodeGen/AMDGPU/wqm.ll | 78 ++++++++++++++++++++++-------------------
 1 file changed, 42 insertions(+), 36 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index fcab9ad..bd2f545 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -360,7 +360,7 @@ define amdgpu_ps float @test_wwm3(i32 inreg %idx) {
 ; GFX9-W64:       ; %bb.0: ; %main_body
 ; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 ; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
-; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v0
+; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB9_2
@@ -381,7 +381,7 @@ define amdgpu_ps float @test_wwm3(i32 inreg %idx) {
 ; GFX10-W32:       ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 ; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
-; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 32, v0
+; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB9_2
@@ -401,7 +401,7 @@ main_body:
   ; use mbcnt to make sure the branch is divergent
   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
-  %cc = icmp uge i32 %hi, 32
+  %cc = icmp uge i32 %hi, 16
   br i1 %cc, label %endif, label %if
 
 if:
@@ -425,7 +425,7 @@ define amdgpu_ps float @test_wwm4(i32 inreg %idx) {
 ; GFX9-W64:       ; %bb.0: ; %main_body
 ; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 ; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
-; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v0
+; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB10_2
@@ -445,7 +445,7 @@ define amdgpu_ps float @test_wwm4(i32 inreg %idx) {
 ; GFX10-W32:       ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 ; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
-; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 32, v0
+; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB10_2
@@ -464,7 +464,7 @@ main_body:
   ; use mbcnt to make sure the branch is divergent
   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
-  %cc = icmp uge i32 %hi, 32
+  %cc = icmp uge i32 %hi, 16
   br i1 %cc, label %endif, label %if
 
 if:
@@ -548,7 +548,7 @@ define amdgpu_ps float @test_wwm6_then() {
 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 ; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
-; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v0
+; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB12_2
@@ -571,7 +571,7 @@ define amdgpu_ps float @test_wwm6_then() {
 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 ; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
-; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 32, v0
+; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-W32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB12_2
@@ -590,7 +590,7 @@ main_body:
   ; use mbcnt to make sure the branch is divergent
   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
-  %cc = icmp uge i32 %hi, 32
+  %cc = icmp uge i32 %hi, 16
   br i1 %cc, label %endif, label %if
 
 if:
@@ -617,7 +617,8 @@ define amdgpu_ps float @test_wwm6_loop() {
 ; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
+; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v0
 ; GFX9-W64-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX9-W64-NEXT:  .LBB13_1: ; %loop
 ; GFX9-W64-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -644,8 +645,9 @@ define amdgpu_ps float @test_wwm6_loop() {
 ; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 ; GFX10-W32-NEXT:    s_mov_b32 s0, 0
+; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v0
 ; GFX10-W32-NEXT:  .LBB13_1: ; %loop
 ; GFX10-W32-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
@@ -672,7 +674,7 @@ main_body:
   br label %loop
 
 loop:
-  %counter = phi i32 [ %lo, %main_body ], [ %counter.1, %loop ]
+  %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ]
   %src1 = load volatile float, float addrspace(1)* undef
   %out = fadd float %src0, %src1
   %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
@@ -817,7 +819,7 @@ define amdgpu_ps float @test_strict_wqm3(i32 inreg %idx) {
 ; GFX9-W64:       ; %bb.0: ; %main_body
 ; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 ; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
-; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v0
+; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB17_2
@@ -839,7 +841,7 @@ define amdgpu_ps float @test_strict_wqm3(i32 inreg %idx) {
 ; GFX10-W32:       ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 ; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
-; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 32, v0
+; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB17_2
@@ -860,7 +862,7 @@ main_body:
   ; use mbcnt to make sure the branch is divergent
   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
-  %cc = icmp uge i32 %hi, 32
+  %cc = icmp uge i32 %hi, 16
   br i1 %cc, label %endif, label %if
 
 if:
@@ -884,7 +886,7 @@ define amdgpu_ps float @test_strict_wqm4(i32 inreg %idx) {
 ; GFX9-W64:       ; %bb.0: ; %main_body
 ; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 ; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
-; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v0
+; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB18_2
@@ -905,7 +907,7 @@ define amdgpu_ps float @test_strict_wqm4(i32 inreg %idx) {
 ; GFX10-W32:       ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 ; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
-; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 32, v0
+; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB18_2
@@ -925,7 +927,7 @@ main_body:
   ; use mbcnt to make sure the branch is divergent
   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
-  %cc = icmp uge i32 %hi, 32
+  %cc = icmp uge i32 %hi, 16
   br i1 %cc, label %endif, label %if
 
 if:
@@ -1013,7 +1015,7 @@ define amdgpu_ps float @test_strict_wqm6_then() {
 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 ; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
-; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v0
+; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB20_2
@@ -1038,7 +1040,7 @@ define amdgpu_ps float @test_strict_wqm6_then() {
 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 ; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
-; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 32, v0
+; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-W32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB20_2
@@ -1058,7 +1060,7 @@ main_body:
   ; use mbcnt to make sure the branch is divergent
   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
-  %cc = icmp uge i32 %hi, 32
+  %cc = icmp uge i32 %hi, 16
   br i1 %cc, label %endif, label %if
 
 if:
@@ -1086,7 +1088,8 @@ define amdgpu_ps float @test_strict_wqm6_loop() {
 ; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
+; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v0
 ; GFX9-W64-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX9-W64-NEXT:  .LBB21_1: ; %loop
 ; GFX9-W64-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -1116,8 +1119,9 @@ define amdgpu_ps float @test_strict_wqm6_loop() {
 ; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 ; GFX10-W32-NEXT:    s_mov_b32 s0, 0
+; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v0
 ; GFX10-W32-NEXT:  .LBB21_1: ; %loop
 ; GFX10-W32-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
@@ -1146,7 +1150,7 @@ main_body:
   br label %loop
 
 loop:
-  %counter = phi i32 [ %lo, %main_body ], [ %counter.1, %loop ]
+  %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ]
   %src1 = load volatile float, float addrspace(1)* undef
   %out = fadd float %src0, %src1
   %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
@@ -2464,7 +2468,7 @@ define amdgpu_ps float @test_strict_wwm3(i32 inreg %idx) {
 ; GFX9-W64:       ; %bb.0: ; %main_body
 ; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 ; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
-; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v0
+; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB39_2
@@ -2485,7 +2489,7 @@ define amdgpu_ps float @test_strict_wwm3(i32 inreg %idx) {
 ; GFX10-W32:       ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 ; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
-; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 32, v0
+; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB39_2
@@ -2505,7 +2509,7 @@ main_body:
   ; use mbcnt to make sure the branch is divergent
   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
-  %cc = icmp uge i32 %hi, 32
+  %cc = icmp uge i32 %hi, 16
   br i1 %cc, label %endif, label %if
 
 if:
@@ -2529,7 +2533,7 @@ define amdgpu_ps float @test_strict_wwm4(i32 inreg %idx) {
 ; GFX9-W64:       ; %bb.0: ; %main_body
 ; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 ; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
-; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v0
+; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB40_2
@@ -2549,7 +2553,7 @@ define amdgpu_ps float @test_strict_wwm4(i32 inreg %idx) {
 ; GFX10-W32:       ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 ; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
-; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 32, v0
+; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB40_2
@@ -2568,7 +2572,7 @@ main_body:
   ; use mbcnt to make sure the branch is divergent
   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
-  %cc = icmp uge i32 %hi, 32
+  %cc = icmp uge i32 %hi, 16
   br i1 %cc, label %endif, label %if
 
 if:
@@ -2652,7 +2656,7 @@ define amdgpu_ps float @test_strict_wwm6_then() {
 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 ; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
-; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v0
+; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB42_2
@@ -2675,7 +2679,7 @@ define amdgpu_ps float @test_strict_wwm6_then() {
 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 ; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
-; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 32, v0
+; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-W32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB42_2
@@ -2694,7 +2698,7 @@ main_body:
   ; use mbcnt to make sure the branch is divergent
   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
-  %cc = icmp uge i32 %hi, 32
+  %cc = icmp uge i32 %hi, 16
   br i1 %cc, label %endif, label %if
 
 if:
@@ -2717,7 +2721,8 @@ define amdgpu_ps float @test_strict_wwm6_loop() {
 ; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
+; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v0
 ; GFX9-W64-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX9-W64-NEXT:  .LBB43_1: ; %loop
 ; GFX9-W64-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -2744,8 +2749,9 @@ define amdgpu_ps float @test_strict_wwm6_loop() {
 ; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 ; GFX10-W32-NEXT:    s_mov_b32 s0, 0
+; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v0
 ; GFX10-W32-NEXT:  .LBB43_1: ; %loop
 ; GFX10-W32-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2772,7 +2778,7 @@ main_body:
   br label %loop
 
 loop:
-  %counter = phi i32 [ %lo, %main_body ], [ %counter.1, %loop ]
+  %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ]
   %src1 = load volatile float, float addrspace(1)* undef
   %out = fadd float %src0, %src1
   %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
-- 
2.7.4